From 1a4d279d22e764eadbe3e042623bae5ef579c739 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Tue, 26 May 2026 11:33:57 -0700 Subject: [PATCH 01/27] Add Qwen3 AutoParallel model and examples --- autoparallel/_testing/models/dsv3.py | 2 +- autoparallel/_testing/models/qwen3.py | 976 +++++++++++++++++++++ examples/example_qwen3.py | 242 +++++ examples/example_sanity_check_qwen3.py | 335 +++++++ examples/example_sanity_check_qwen3_moe.py | 466 ++++++++++ examples/example_torchtitan_qwen3_dense.py | 370 ++++++++ tests/test_dsv3_torchtitan_config.py | 35 + tests/test_qwen3.py | 323 +++++++ 8 files changed, 2748 insertions(+), 1 deletion(-) create mode 100644 autoparallel/_testing/models/qwen3.py create mode 100644 examples/example_qwen3.py create mode 100644 examples/example_sanity_check_qwen3.py create mode 100644 examples/example_sanity_check_qwen3_moe.py create mode 100644 examples/example_torchtitan_qwen3_dense.py create mode 100644 tests/test_dsv3_torchtitan_config.py create mode 100644 tests/test_qwen3.py diff --git a/autoparallel/_testing/models/dsv3.py b/autoparallel/_testing/models/dsv3.py index 5a897b71..05f78a92 100644 --- a/autoparallel/_testing/models/dsv3.py +++ b/autoparallel/_testing/models/dsv3.py @@ -1581,7 +1581,7 @@ def __init__( route_norm=moe_cfg.router.route_norm, route_scale=moe_cfg.router.route_scale, score_before_experts=moe_cfg.experts.token_dispatcher.score_before_experts, - use_grouped_mm=moe_cfg.experts.use_grouped_mm, + use_grouped_mm=getattr(moe_cfg.experts, "use_grouped_mm", True), load_balance_coeff=moe_cfg.load_balance_coeff, mesh=mesh, compute_dtype=compute_dtype, diff --git a/autoparallel/_testing/models/qwen3.py b/autoparallel/_testing/models/qwen3.py new file mode 100644 index 00000000..7bef8b17 --- /dev/null +++ b/autoparallel/_testing/models/qwen3.py @@ -0,0 +1,976 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 
+# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + +import math +from dataclasses import dataclass +from typing import Callable, ClassVar, Optional + +import torch +import torch.nn.functional as F +from torch import nn +from torch.distributed.tensor import DeviceMesh +from torch.distributed.tensor.placement_types import Partial, Replicate, Shard +from torch.fx import traceback as fx_traceback +from torch.nn.attention import sdpa_kernel, SDPBackend + +from autoparallel._testing.models.dsv3 import ( + _permute, + _run_experts_for_loop, + _run_experts_grouped_mm, + _token_combine, +) +from autoparallel.collectives import all_to_all, axis_size, local_map + + +def has_cuda_capability(major: int, minor: int) -> bool: + return torch.cuda.is_available() and torch.cuda.get_device_capability() >= ( + major, + minor, + ) + + +class ScaledDotProductAttention(torch.nn.Module): + backends: ClassVar[list[SDPBackend]] = [] + + def __init__(self, attn_mask_type: str) -> None: + super().__init__() + if attn_mask_type != "causal": + raise ValueError("Qwen3 with SDPA currently only supports causal mask.") + + ScaledDotProductAttention._init_backend() + + @classmethod + def _init_backend(cls) -> None: + if cls.backends: + return + + cls.backends = [ + SDPBackend.FLASH_ATTENTION, + SDPBackend.EFFICIENT_ATTENTION, + SDPBackend.MATH, + ] + if has_cuda_capability(10, 0): + cls.backends.insert(0, SDPBackend.CUDNN_ATTENTION) + + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: float | None = None, + ) -> torch.Tensor: + assert self.backends, "SDPA backends should not be empty." 
+ with sdpa_kernel(self.backends, set_priority=True): + return F.scaled_dot_product_attention( + q, + k, + v, + is_causal=True, + scale=scale, + ) + + +def build_attention(attn_mask_type: str): + if attn_mask_type != "causal": + raise ValueError("Qwen3 with SDPA currently only supports causal mask.") + return ScaledDotProductAttention(attn_mask_type) + + +@dataclass +class Qwen3ModelArgs: + dim: int = 4096 + n_layers: int = 36 + n_heads: int = 32 + n_kv_heads: Optional[int] = 8 + head_dim: int = 128 + hidden_dim: int = 12288 + vocab_size: int = 151936 + norm_eps: float = 1e-6 + rope_theta: float = 1000000.0 + max_seq_len: int = 4096 + depth_init: bool = True + attn_mask_type: str = "causal" + eos_id: int = 0 + enable_weight_tying: bool = False + moe_enabled: bool = False + moe_hidden_dim: int = 768 + num_experts: int = 64 + top_k: int = 8 + route_norm: bool = True + route_scale: float = 1.0 + score_before_experts: bool = False + use_grouped_mm: bool = True + load_balance_coeff: Optional[float] = 1e-3 + moe_axis_name: str = "ep" + + def __post_init__(self) -> None: + n_kv_heads = self.n_heads if self.n_kv_heads is None else self.n_kv_heads + if self.n_heads % n_kv_heads != 0: + raise ValueError( + f"n_heads ({self.n_heads}) must be divisible by " + f"n_kv_heads ({n_kv_heads})." + ) + if self.moe_enabled and self.top_k > self.num_experts: + raise ValueError( + f"top_k ({self.top_k}) must be <= num_experts ({self.num_experts})." 
+ ) + + def update_from_config(self, job_config, tokenizer) -> None: + self.vocab_size = tokenizer.n_words + self.max_seq_len = job_config.training.seq_len + self.eos_id = tokenizer.eos_id + + def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: + nparams = sum(p.numel() for p in model.parameters()) + nparams_embedding = sum( + sum(p.numel() for p in m.parameters()) + for m in model.children() + if isinstance(m, nn.Embedding) + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.head_dim, + seq_len, + ) + num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + return nparams, num_flops_per_token + + +def qwen3_args_from_torchtitan_config(config) -> Qwen3ModelArgs: + """Build AutoParallel Qwen3 args from TorchTitan's Qwen3Model.Config.""" + if not config.layers: + raise ValueError("Qwen3 config must contain at least one layer.") + + first_layer = config.layers[0] + attention = first_layer.attention + moe = first_layer.moe + + if getattr(attention, "fuse_qkv", False): + raise ValueError("AutoParallel Qwen3 does not support fused QKV yet.") + + moe_enabled = moe is not None + if moe_enabled: + hidden_dim = 0 + moe_hidden_dim = moe.experts.hidden_dim + num_experts = moe.num_experts + top_k = moe.router.top_k + route_norm = moe.router.route_norm + route_scale = moe.router.route_scale + score_before_experts = moe.experts.token_dispatcher.score_before_experts + load_balance_coeff = moe.load_balance_coeff + else: + hidden_dim = first_layer.feed_forward.w1.out_features + moe_hidden_dim = 0 + num_experts = 0 + top_k = 1 + route_norm = True + route_scale = 1.0 + score_before_experts = False + load_balance_coeff = None + + return Qwen3ModelArgs( + dim=config.dim, + n_layers=len(config.layers), + n_heads=attention.n_heads, + n_kv_heads=attention.n_kv_heads, + head_dim=attention.head_dim, + hidden_dim=hidden_dim, + vocab_size=config.vocab_size, + norm_eps=config.norm.eps, + rope_theta=config.rope.theta, + 
max_seq_len=config.rope.max_seq_len, + attn_mask_type=attention.mask_type, + enable_weight_tying=config.enable_weight_tying, + moe_enabled=moe_enabled, + moe_hidden_dim=moe_hidden_dim, + num_experts=num_experts, + top_k=top_k, + route_norm=route_norm, + route_scale=route_scale, + score_before_experts=score_before_experts, + load_balance_coeff=load_balance_coeff, + ) + + +def qwen3_debug_args(**overrides) -> Qwen3ModelArgs: + args = Qwen3ModelArgs( + dim=256, + n_layers=8, + n_heads=16, + n_kv_heads=8, + head_dim=128, + hidden_dim=3072, + vocab_size=2048, + max_seq_len=4096, + enable_weight_tying=True, + ) + for key, value in overrides.items(): + setattr(args, key, value) + args.__post_init__() + return args + + +def qwen3_0_6b_args(**overrides) -> Qwen3ModelArgs: + args = Qwen3ModelArgs( + dim=1024, + n_layers=28, + n_heads=16, + n_kv_heads=8, + head_dim=128, + hidden_dim=3072, + vocab_size=151936, + enable_weight_tying=True, + ) + for key, value in overrides.items(): + setattr(args, key, value) + args.__post_init__() + return args + + +def qwen3_1_7b_args(**overrides) -> Qwen3ModelArgs: + args = Qwen3ModelArgs( + dim=2048, + n_layers=28, + n_heads=16, + n_kv_heads=8, + head_dim=128, + hidden_dim=6144, + vocab_size=151936, + enable_weight_tying=True, + ) + for key, value in overrides.items(): + setattr(args, key, value) + args.__post_init__() + return args + + +def qwen3_4b_args(**overrides) -> Qwen3ModelArgs: + args = Qwen3ModelArgs( + dim=2560, + n_layers=36, + n_heads=32, + n_kv_heads=8, + head_dim=128, + hidden_dim=9728, + vocab_size=151936, + enable_weight_tying=True, + ) + for key, value in overrides.items(): + setattr(args, key, value) + args.__post_init__() + return args + + +def qwen3_8b_args(**overrides) -> Qwen3ModelArgs: + args = Qwen3ModelArgs() + for key, value in overrides.items(): + setattr(args, key, value) + args.__post_init__() + return args + + +def qwen3_moe_debug_args(**overrides) -> Qwen3ModelArgs: + args = Qwen3ModelArgs( + dim=256, + 
n_layers=8, + n_heads=16, + n_kv_heads=8, + head_dim=128, + hidden_dim=3072, + vocab_size=2048, + max_seq_len=4096, + moe_enabled=True, + moe_hidden_dim=768, + num_experts=64, + top_k=8, + route_norm=True, + score_before_experts=False, + ) + for key, value in overrides.items(): + setattr(args, key, value) + args.__post_init__() + return args + + +def qwen3_30b_a3b_args(**overrides) -> Qwen3ModelArgs: + args = Qwen3ModelArgs( + dim=2048, + n_layers=48, + n_heads=32, + n_kv_heads=4, + head_dim=128, + hidden_dim=6144, + vocab_size=151936, + max_seq_len=262144, + moe_enabled=True, + moe_hidden_dim=768, + num_experts=128, + top_k=8, + route_norm=True, + score_before_experts=False, + ) + for key, value in overrides.items(): + setattr(args, key, value) + args.__post_init__() + return args + + +def qwen3_235b_a22b_args(**overrides) -> Qwen3ModelArgs: + args = Qwen3ModelArgs( + dim=4096, + n_layers=94, + n_heads=64, + n_kv_heads=4, + head_dim=128, + hidden_dim=12288, + vocab_size=151936, + max_seq_len=4096, + rope_theta=5000000.0, + moe_enabled=True, + moe_hidden_dim=1536, + num_experts=128, + top_k=8, + route_norm=True, + score_before_experts=False, + ) + for key, value in overrides.items(): + setattr(args, key, value) + args.__post_init__() + return args + + +def precompute_freqs_cos_sin( + dim: int, + max_seq_len: int, + theta: float = 1000000.0, +) -> torch.Tensor: + freq = theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim) + inv_freq = 1.0 / freq + t = torch.arange(max_seq_len, dtype=inv_freq.dtype, device=inv_freq.device) + freqs = torch.outer(t, inv_freq).float() + freqs = torch.cat([freqs, freqs], dim=-1) + cos = freqs.cos() + sin = freqs.sin() + return torch.cat([cos, sin], dim=-1) + + +def reshape_for_broadcast_cos_sin( + rope_cache: torch.Tensor, + x: torch.Tensor, +) -> torch.Tensor: + bsz, seqlen, _, head_dim = x.shape + rope_cache = rope_cache[0:seqlen] + assert rope_cache.shape == (seqlen, head_dim * 2) + return rope_cache.view(1, seqlen, 1, 
head_dim * 2).expand(bsz, -1, -1, -1) + + +def _rotate_half(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_emb_cos_sin( + xq: torch.Tensor, + xk: torch.Tensor, + rope_cache: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + head_dim = xq.shape[-1] + rope_cache = reshape_for_broadcast_cos_sin(rope_cache, xq) + cos = rope_cache[..., :head_dim].to(device=xq.device) + sin = rope_cache[..., head_dim:].to(device=xq.device) + xq_f = xq.float() + xk_f = xk.float() + xq_out = (xq_f * cos) + (_rotate_half(xq_f) * sin) + xk_out = (xk_f * cos) + (_rotate_half(xk_f) * sin) + return xq_out.type_as(xq), xk_out.type_as(xk) + + +def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: + bs, slen, n_kv_heads, head_dim = x.shape + if n_rep == 1: + return x + return ( + torch.unsqueeze(x, dim=3) + .expand(bs, slen, n_kv_heads, n_rep, head_dim) + .reshape(bs, slen, n_kv_heads * n_rep, head_dim) + ) + + +def _to_activation_device(tensor: torch.Tensor, activation: torch.Tensor) -> torch.Tensor: + if tensor.device != activation.device and tensor.device.type == "meta": + return tensor.to(activation.device) + return tensor + + +def _rms_norm(x: torch.Tensor, norm: nn.RMSNorm) -> torch.Tensor: + weight = ( + _to_activation_device(norm.weight, x) + if norm.weight is not None + else None + ) + if weight is not None and weight.dtype != x.dtype: + weight = weight.to(dtype=x.dtype) + return F.rms_norm(x, norm.normalized_shape, weight, norm.eps).to(dtype=x.dtype) + + +def _linear(x: torch.Tensor, linear: nn.Linear) -> torch.Tensor: + weight = _to_activation_device(linear.weight, x) + bias = ( + _to_activation_device(linear.bias, x) + if linear.bias is not None + else None + ) + if weight.dtype != x.dtype: + weight = weight.to(dtype=x.dtype) + if bias is not None and bias.dtype != x.dtype: + bias = bias.to(dtype=x.dtype) + return F.linear(x, weight, bias) + + +class 
Attention(nn.Module): + def __init__(self, model_args: Qwen3ModelArgs): + super().__init__() + self.n_heads = model_args.n_heads + self.n_kv_heads = ( + model_args.n_heads + if model_args.n_kv_heads is None + else model_args.n_kv_heads + ) + self.n_rep = self.n_heads // self.n_kv_heads + self.head_dim = model_args.head_dim + self.scale = self.head_dim**-0.5 + + self.wq = nn.Linear( + model_args.dim, model_args.n_heads * self.head_dim, bias=False + ) + self.wk = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wv = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wo = nn.Linear( + model_args.n_heads * self.head_dim, model_args.dim, bias=False + ) + self.q_norm = nn.RMSNorm(self.head_dim, eps=model_args.norm_eps) + self.k_norm = nn.RMSNorm(self.head_dim, eps=model_args.norm_eps) + self.sdpa = build_attention(model_args.attn_mask_type) + + def init_weights(self, init_std: float): + for linear in (self.wq, self.wk, self.wv): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std) + self.q_norm.reset_parameters() + self.k_norm.reset_parameters() + + def forward( + self, + x: torch.Tensor, + freqs_cos_sin: torch.Tensor, + ): + bs, seqlen, _ = x.shape + xq, xk, xv = _linear(x, self.wq), _linear(x, self.wk), _linear(x, self.wv) + + xq = xq.view(bs, seqlen, -1, self.head_dim) + xk = xk.view(bs, seqlen, -1, self.head_dim) + xv = xv.view(bs, seqlen, -1, self.head_dim) + + xq = _rms_norm(xq, self.q_norm) + xk = _rms_norm(xk, self.k_norm) + freqs_cos_sin = _to_activation_device(freqs_cos_sin, xq) + xq, xk = apply_rotary_emb_cos_sin(xq, xk, freqs_cos_sin) + + keys = repeat_kv(xk, self.n_rep) + values = repeat_kv(xv, self.n_rep) + + xq = xq.transpose(1, 2) + xk = keys.transpose(1, 2) + xv = values.transpose(1, 2) + + output = self.sdpa(xq, xk, xv, scale=self.scale) + + output = output.transpose(1, 2).contiguous() + output = output.view(bs, seqlen, -1) + 
return _linear(output, self.wo) + + +class FeedForward(nn.Module): + def __init__(self, dim: int, hidden_dim: int): + super().__init__() + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return _linear(F.silu(_linear(x, self.w1)) * _linear(x, self.w3), self.w2) + + def init_weights(self, init_std: float): + nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02) + for linear in (self.w2, self.w3): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std) + + +class GroupedExperts(nn.Module): + def __init__( + self, + dim: int, + hidden_dim: int, + num_experts: int, + use_grouped_mm: bool, + ): + super().__init__() + self.num_experts = num_experts + self.w1 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim)) + self.w2 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim)) + self.w3 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim)) + self.use_grouped_mm = use_grouped_mm + + def forward( + self, + x: torch.Tensor, + num_tokens_per_expert: torch.Tensor, + ) -> torch.Tensor: + if self.use_grouped_mm: + return _run_experts_grouped_mm( + self.w1, self.w2, self.w3, x, num_tokens_per_expert + ) + return _run_experts_for_loop( + self.w1, self.w2, self.w3, x, num_tokens_per_expert + ) + + def init_weights(self, init_std: float): + nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02) + nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std) + nn.init.trunc_normal_(self.w3, mean=0.0, std=init_std) + + +def _qwen3_token_dispatch(routed_input, num_tokens_per_expert, axis_name): + ep_size = axis_size(axis_name) + num_tokens_per_expert_group = all_to_all( + num_tokens_per_expert, + None, + None, + axis_name, + ) + + with torch.no_grad(): + input_splits = ( + num_tokens_per_expert.view(ep_size, -1) + .sum(dim=1) + .to(torch.device("cpu"), non_blocking=True) + ) + output_splits = ( + num_tokens_per_expert_group.view(ep_size, -1) + 
.sum(dim=1) + .to(torch.device("cpu"), non_blocking=False) + ) + input_splits = input_splits.tolist() + output_splits = output_splits.tolist() + + with fx_traceback.annotate({"comm_region": "token_dispatch"}): + routed_input = all_to_all( + routed_input, + output_splits, + input_splits, + axis_name, + ) + + num_local_experts = num_tokens_per_expert_group.shape[0] // ep_size + return ( + *_permute( + routed_input, + num_tokens_per_expert_group, + ep_size, + num_local_experts, + ), + input_splits, + output_splits, + ) + + +def qwen3_moe_local_mapped_region( + x: torch.Tensor, + selected_experts_indices: torch.Tensor, + top_scores: torch.Tensor, + experts_w1: torch.Tensor, + experts_w3: torch.Tensor, + experts_w2: torch.Tensor, + out: torch.Tensor, + top_k: int, + num_experts: int, + score_before_experts: bool, + axis_name: str, +) -> tuple[torch.Tensor, torch.Tensor]: + dim = x.shape[-1] + ep_size = axis_size(axis_name) + if num_experts % ep_size != 0: + raise ValueError( + f"num_experts ({num_experts}) must be divisible by " + f"axis_size({axis_name!r}) ({ep_size})." 
+ ) + + num_tokens_per_expert = torch.histc( + selected_experts_indices.flatten(), + bins=num_experts, + min=0, + max=num_experts, + ).view(-1) + + token_indices_experts_sorted = torch.argsort( + selected_experts_indices.view(-1), stable=True + ) + top_scores_experts_sorted = top_scores.view(-1)[token_indices_experts_sorted] + token_indices_experts_sorted = token_indices_experts_sorted // top_k + + routed_input = x[token_indices_experts_sorted] + if score_before_experts: + routed_input = ( + routed_input.to(torch.float32) * top_scores_experts_sorted.reshape(-1, 1) + ).to(x.dtype) + + shape = routed_input.shape + ( + input_shape, + routed_input, + permuted_indices, + num_tokens_per_expert_group, + input_splits, + output_splits, + ) = _qwen3_token_dispatch(routed_input, num_tokens_per_expert, axis_name) + + routed_output = _run_experts_grouped_mm( + experts_w1, + experts_w2, + experts_w3, + routed_input, + num_tokens_per_expert_group, + ) + routed_output = _token_combine( + routed_output, + input_shape, + permuted_indices, + input_splits, + output_splits, + axis_name, + ) + + torch._check(routed_output.shape[0] == shape[0]) + if not score_before_experts: + routed_output = ( + routed_output.to(torch.float32) * top_scores_experts_sorted.reshape(-1, 1) + ).to(routed_output.dtype) + + out = out.scatter_add( + dim=0, + index=token_indices_experts_sorted.reshape(-1, 1).expand(-1, dim), + src=routed_output, + ) + return out, num_tokens_per_expert + +class MoE(nn.Module): + def __init__( + self, + model_args: Qwen3ModelArgs, + mesh: DeviceMesh | None = None, + axis_name: str | None = None, + ): + super().__init__() + self.mesh = mesh + self.axis_name = axis_name or model_args.moe_axis_name + self.num_experts = model_args.num_experts + self.top_k = model_args.top_k + self.route_norm = model_args.route_norm + self.route_scale = model_args.route_scale + self.score_before_experts = model_args.score_before_experts + self.load_balance_coeff = model_args.load_balance_coeff + + 
self.router = nn.Linear(model_args.dim, model_args.num_experts, bias=False) + self.experts = GroupedExperts( + dim=model_args.dim, + hidden_dim=model_args.moe_hidden_dim, + num_experts=model_args.num_experts, + use_grouped_mm=model_args.use_grouped_mm, + ) + self.register_buffer( + "expert_bias", + torch.zeros(model_args.num_experts, dtype=torch.float32), + persistent=self.load_balance_coeff is not None, + ) + self.register_buffer( + "tokens_per_expert", + torch.zeros(model_args.num_experts, dtype=torch.float32), + persistent=False, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + bs, slen, dim = x.shape + x = x.view(-1, dim) + experts_w1, experts_w2, experts_w3 = self.experts.parameters() + experts_w1 = _to_activation_device(experts_w1, x) + experts_w2 = _to_activation_device(experts_w2, x) + experts_w3 = _to_activation_device(experts_w3, x) + + scores = F.linear( + x.to(torch.float32), + _to_activation_device(self.router.weight, x).to(torch.float32), + None, + ) + scores = F.softmax(scores, dim=-1) + expert_bias = _to_activation_device(self.expert_bias, scores) + scores_for_choice = ( + scores + expert_bias + if self.load_balance_coeff is not None + else scores + ) + _, selected_experts_indices = torch.topk( + scores_for_choice, + k=self.top_k, + dim=-1, + sorted=False, + ) + + top_scores = scores.gather(dim=-1, index=selected_experts_indices) + if self.route_norm: + denominator = top_scores.sum(dim=-1, keepdim=True) + 1e-20 + top_scores = top_scores / denominator + top_scores = top_scores * self.route_scale + + # Qwen3 MoE has no shared expert path, but keeping the initial output + # differentiably tied to x matches the DSv3 local_map autograd shape. 
+ out = x * 0 + out, num_tokens_per_expert = local_map( + qwen3_moe_local_mapped_region, + out_placements=( + (Shard(0), Shard(0)), + (Partial(reduce_op="sum"), Partial(reduce_op="sum")), + ), + in_placements=( + (Shard(0), Shard(0)), + (Shard(0), Shard(0)), + (Shard(0), Shard(0)), + (Replicate(), Shard(0)), + (Replicate(), Shard(0)), + (Replicate(), Shard(0)), + (Shard(0), Shard(0)), + None, + None, + None, + None, + ), + redistribute_inputs=True, + in_grad_placements=None, + device_mesh=self.mesh, + )( + x, + selected_experts_indices, + top_scores, + experts_w1, + experts_w3, + experts_w2, + out, + self.top_k, + self.num_experts, + self.score_before_experts, + self.axis_name, + ) + # This counter is only used for runtime load-balance diagnostics. During + # AutoParallel graph capture the module buffers are fake/meta tensors + # while the traced local_map output can be CUDA-fake, and recording this + # mutation is not needed for the solved training graph. + if not torch.compiler.is_compiling(): + with torch.no_grad(): + self.tokens_per_expert.add_(num_tokens_per_expert) # type: ignore[operator] + return out.reshape(bs, slen, dim) + + def init_weights( + self, + init_std: float, + buffer_device: torch.device, + ): + nn.init.trunc_normal_(self.router.weight, mean=0.0, std=init_std) + self.experts.init_weights(init_std) + with torch.device(buffer_device): + self.tokens_per_expert.zero_() # type: ignore[operator] + self.expert_bias.zero_() # type: ignore[operator] + + +class TransformerBlock(nn.Module): + def __init__( + self, + layer_id: int, + model_args: Qwen3ModelArgs, + mesh: DeviceMesh | None = None, + moe_axis_name: str | None = None, + ): + super().__init__() + self.attention = Attention(model_args) + self.moe_enabled = model_args.moe_enabled + if self.moe_enabled: + self.moe = MoE(model_args, mesh=mesh, axis_name=moe_axis_name) + else: + self.feed_forward = FeedForward( + dim=model_args.dim, + hidden_dim=model_args.hidden_dim, + ) + self.attention_norm = 
nn.RMSNorm(model_args.dim, eps=model_args.norm_eps) + self.ffn_norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps) + + if model_args.depth_init: + self.weight_init_std = 0.02 / math.sqrt(2 * (layer_id + 1)) + else: + self.weight_init_std = 0.02 / math.sqrt(2 * model_args.n_layers) + + def forward( + self, + x: torch.Tensor, + freqs_cos_sin: torch.Tensor, + ): + h = x + self.attention(_rms_norm(x, self.attention_norm), freqs_cos_sin) + if self.moe_enabled: + out = h + self.moe(_rms_norm(h, self.ffn_norm)) + else: + out = h + self.feed_forward(_rms_norm(h, self.ffn_norm)) + return out + + def init_weights(self, buffer_device: torch.device): + for norm in (self.attention_norm, self.ffn_norm): + norm.reset_parameters() + self.attention.init_weights(self.weight_init_std) + if self.moe_enabled: + self.moe.init_weights(self.weight_init_std, buffer_device) + else: + self.feed_forward.init_weights(self.weight_init_std) + + +class Transformer(nn.Module): + def __init__( + self, + model_args: Qwen3ModelArgs, + mesh: DeviceMesh | None = None, + moe_axis_name: str | None = None, + ): + super().__init__() + self.model_args = model_args + self.vocab_size = model_args.vocab_size + self.n_layers = model_args.n_layers + self.eos_id = model_args.eos_id + self.enable_weight_tying = model_args.enable_weight_tying + self.mesh = mesh + self.moe_axis_name = moe_axis_name or model_args.moe_axis_name + + self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim) + self.register_buffer( + "freqs_cos_sin", + self._precompute_freqs_cos_sin(), + persistent=True, + ) + + self.layers = torch.nn.ModuleDict() + for layer_id in range(model_args.n_layers): + self.layers[str(layer_id)] = TransformerBlock( + layer_id, + model_args, + mesh=mesh, + moe_axis_name=self.moe_axis_name, + ) + self.norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps) + self.lm_head = nn.Linear(model_args.dim, model_args.vocab_size, bias=False) + + if self.enable_weight_tying: + 
self.tok_embeddings.weight = self.lm_head.weight + + def init_weights( + self, + buffer_device: Optional[torch.device] = None, + seed: int | None = None, + ): + if seed is not None: + torch.manual_seed(seed) + + if self.enable_weight_tying: + self.tok_embeddings.weight = self.lm_head.weight + + buffer_device = buffer_device or self.freqs_cos_sin.device # type: ignore[assignment] + with torch.device(buffer_device): # type: ignore[arg-type] + self.freqs_cos_sin = self._precompute_freqs_cos_sin() + + if not self.enable_weight_tying and self.tok_embeddings is not None: + nn.init.normal_(self.tok_embeddings.weight) + for layer in self.layers.values(): + if layer is not None: + layer.init_weights(buffer_device) # type: ignore[operator] + if self.norm is not None: + self.norm.reset_parameters() + + final_out_std = self.model_args.dim**-0.5 + cutoff_factor = 3 + if self.lm_head is not None: + nn.init.trunc_normal_( + self.lm_head.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + + if self.enable_weight_tying: + self.tok_embeddings.weight = self.lm_head.weight + + def _precompute_freqs_cos_sin(self) -> torch.Tensor: + return precompute_freqs_cos_sin( + self.model_args.head_dim, + self.model_args.max_seq_len, + self.model_args.rope_theta, + ) + + def _token_embedding(self, tokens: torch.Tensor) -> torch.Tensor: + weight = self.tok_embeddings.weight + if weight.device != tokens.device and weight.device.type == "meta": + weight = weight.to(tokens.device) + return F.embedding(tokens, weight) + + def forward(self, tokens: torch.Tensor, input_batch: Optional[torch.Tensor] = None): + h = self._token_embedding(tokens) if self.tok_embeddings is not None else tokens + + for layer in self.layers.values(): + h = layer(h, self.freqs_cos_sin) + + h = _rms_norm(h, self.norm) if self.norm is not None else h + output = _linear(h, self.lm_head) if self.lm_head is not None else h + return output + + +_MODULE_FQN = 
"module_fqn" + + +def _annotate_once(fn: Callable, meta: dict): + if getattr(fn, "_graph_trainer_annotated", False): + return fn + wrapped = fx_traceback.annotate_fn(meta)(fn) + setattr(wrapped, "_graph_trainer_annotated", True) + return wrapped + + +def _annotate_module_fqns(model: nn.Module) -> None: + for fqn, submodule in model.named_modules(): + if fqn: + submodule.forward = _annotate_once( + submodule.forward, + {_MODULE_FQN: fqn}, + ) + + +def annotate_qwen3_for_graph_trainer(model: Transformer) -> None: + """Attach graph_trainer-compatible FX annotations to AP's Qwen3 model.""" + global qwen3_moe_local_mapped_region + + qwen3_moe_local_mapped_region = _annotate_once( + qwen3_moe_local_mapped_region, + {"EP": "compute"}, + ) + MoE.forward = _annotate_once( # type: ignore[method-assign] + MoE.forward, + {"EP": "compute"}, + ) + _annotate_module_fqns(model) diff --git a/examples/example_qwen3.py b/examples/example_qwen3.py new file mode 100644 index 00000000..2ae57b00 --- /dev/null +++ b/examples/example_qwen3.py @@ -0,0 +1,242 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +import time + +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.qwen3 import ( + Qwen3ModelArgs, + Transformer, + qwen3_235b_a22b_args, + qwen3_30b_a3b_args, + qwen3_8b_args, + qwen3_debug_args, + qwen3_moe_debug_args, +) +from autoparallel.api import AutoParallel +from autoparallel.compile import autoparallel_backend + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Trace, optimize, and smoke-test dense Qwen3 with AutoParallel." 
+ ) + parser.add_argument( + "--flavor", + choices=("tiny", "moe-tiny", "debug", "8b", "moe-debug", "30b-a3b", "235b-a22b"), + default="tiny", + help="Qwen3 model size to instantiate. Defaults to tiny for faster runs.", + ) + parser.add_argument( + "--seq-len", + type=int, + default=None, + help="Sequence length. Defaults to 8 for tiny, 512 for debug, and 4096 for 8b.", + ) + parser.add_argument( + "--world-size", + type=int, + default=64, + help="Fake process-group world size.", + ) + parser.add_argument( + "--tp-degree", + type=int, + default=8, + help="Second mesh degree. Used as TP for dense flavors and EP for MoE flavors.", + ) + parser.add_argument( + "--local-batch-size", + type=int, + default=2, + help="Per-DP-rank batch size used for the runtime smoke pass.", + ) + parser.add_argument( + "--save-optimizer", + type=str, + default=None, + help="Optional path for the serialized sharding optimizer state.", + ) + parser.add_argument( + "--compile", + action="store_true", + help="Compile the placed module with the AutoParallel backend before running.", + ) + parser.add_argument( + "--skip-run", + action="store_true", + help="Only run tracing, optimization, and placement application.", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Print the full AutoParallel optimizer log.", + ) + return parser.parse_args() + + +def make_model_args(flavor: str, seq_len: int): + if flavor == "tiny": + return Qwen3ModelArgs( + dim=64, + n_layers=2, + n_heads=4, + n_kv_heads=2, + head_dim=16, + hidden_dim=128, + vocab_size=128, + max_seq_len=seq_len, + ) + if flavor == "moe-tiny": + return Qwen3ModelArgs( + dim=64, + n_layers=1, + n_heads=4, + n_kv_heads=2, + head_dim=16, + hidden_dim=128, + vocab_size=128, + max_seq_len=seq_len, + moe_enabled=True, + moe_hidden_dim=32, + num_experts=8, + top_k=2, + route_norm=True, + score_before_experts=False, + ) + if flavor == "debug": + return qwen3_debug_args(max_seq_len=seq_len) + if flavor == "8b": + return 
qwen3_8b_args(max_seq_len=seq_len) + if flavor == "moe-debug": + return qwen3_moe_debug_args(max_seq_len=seq_len) + if flavor == "30b-a3b": + return qwen3_30b_a3b_args(max_seq_len=seq_len) + if flavor == "235b-a22b": + return qwen3_235b_a22b_args(max_seq_len=seq_len) + raise ValueError(f"Unknown Qwen3 flavor: {flavor}") + + +def main(): + args = parse_args() + logging.basicConfig(level=logging.DEBUG) + + seq_len = args.seq_len + if seq_len is None: + seq_len = { + "tiny": 8, + "moe-tiny": 8, + "debug": 512, + "8b": 4096, + "moe-debug": 512, + "30b-a3b": 4096, + "235b-a22b": 4096, + }[args.flavor] + if args.world_size % args.tp_degree != 0: + raise ValueError( + f"world-size ({args.world_size}) must be divisible by " + f"tp-degree ({args.tp_degree})." + ) + + if not torch.distributed.is_initialized(): + fake_store = FakeStore() + torch.distributed.init_process_group( + "fake", + store=fake_store, + rank=0, + world_size=args.world_size, + ) + + model_args = make_model_args(args.flavor, seq_len) + mesh_dim_names = ("dp", "ep") if model_args.moe_enabled else ("dp", "tp") + mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", + (args.world_size // args.tp_degree, args.tp_degree), + mesh_dim_names=mesh_dim_names, + ) + device = torch.device("cuda") + + global_batch_size = args.local_batch_size * mesh.shape[0] + if model_args.moe_enabled: + global_batch_size *= mesh.shape[1] + + with torch.device("meta"): + model = Transformer( + model_args, + mesh=mesh if model_args.moe_enabled else None, + moe_axis_name=mesh.mesh_dim_names[1], + ) + + def input_fn(): + return torch.randint( + 0, + model_args.vocab_size, + (global_batch_size, seq_len), + device=device, + ) + + mp_policy = MixedPrecisionPolicy( + param_dtype=torch.bfloat16, + reduce_dtype=torch.float32, + ) + + t0 = time.time() + with AutoParallel( + model, + input_fn, + mesh, + mp_policy, + dynamic=model_args.moe_enabled, + repeated_subgraphs=True, + ) as autop: + 
autop.add_parameter_memory_constraint(low=None, high=None) + + x_sharding = (Shard(0), Shard(0)) if model_args.moe_enabled else (Shard(0), Replicate()) + out_sharding = (Shard(0), Shard(2)) + autop.add_input_constraints([x_sharding]) + autop.add_output_constraints([out_sharding]) + + sharding_placement = autop.optimize_placement(verbose=args.verbose) + print(f"Tracing + optimization took {time.time() - t0:.1f}s") + + if args.save_optimizer is not None: + autop.sharding_optimizer.save(args.save_optimizer) + autop.sharding_optimizer.save_placements( + f"{args.save_optimizer}.placements.json" + ) + + parallel_mod = autop.apply_placement(sharding_placement) + + if args.skip_run: + print("Placement applied successfully.") + return + + parallel_mod.to_empty(device=device) + parallel_mod.init_weights(buffer_device=device) # type: ignore[operator] + + if args.compile: + parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend()) + + tokens = torch.randint( + 0, + model_args.vocab_size, + (args.local_batch_size, seq_len), + device=device, + ) + out = parallel_mod(tokens) + if torch.any(torch.isnan(out)): + raise RuntimeError("Found NaNs in Qwen3 forward output.") + out.backward(torch.randn_like(out)) + print("All good!") + + +if __name__ == "__main__": + main() diff --git a/examples/example_sanity_check_qwen3.py b/examples/example_sanity_check_qwen3.py new file mode 100644 index 00000000..b7af6c0d --- /dev/null +++ b/examples/example_sanity_check_qwen3.py @@ -0,0 +1,335 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
def parse_args():
    """Parse the command line of the Qwen3 8B training sanity check.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser(
        description="Run a real Qwen3 8B AutoParallel training sanity check."
    )

    # (flag, type, default, help) for every option that takes a value.
    valued_options = (
        (
            "--global-batch-size",
            int,
            16,
            "Global batch size across data-parallel ranks.",
        ),
        (
            "--microbatch-size",
            int,
            1,
            "Per-DP-rank microbatch size for gradient accumulation.",
        ),
        (
            "--seq-len",
            int,
            4096,
            "Sequence length. Defaults to Qwen3 8B's max sequence length.",
        ),
        ("--dp-degree", int, 2, "Data-parallel mesh degree."),
        ("--tp-degree", int, 2, "Tensor-parallel mesh degree."),
        ("--train-steps", int, 20, "Number of optimizer steps."),
        ("--lr", float, 3e-4, "AdamW learning rate."),
        ("--max-grad-norm", float, 1.0, "Gradient clipping max norm."),
        (
            "--seed",
            int,
            0,
            "Seed for model initialization and synthetic data generation.",
        ),
    )
    for flag, value_type, fallback, text in valued_options:
        cli.add_argument(flag, type=value_type, default=fallback, help=text)

    # Boolean switches, off unless given on the command line.
    switches = (
        (
            "--compile",
            "Compile the placed module with the AutoParallel backend before training.",
        ),
        ("--verbose", "Print the full AutoParallel optimizer log."),
    )
    for flag, text in switches:
        cli.add_argument(flag, action="store_true", help=text)

    return cli.parse_args()
init_distributed(args): + if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ: + raise RuntimeError( + "Run this example with torchrun, e.g. " + "torchrun --standalone --nproc-per-node 4 " + "examples/example_sanity_check_qwen3.py" + ) + + world_size = int(os.environ["WORLD_SIZE"]) + local_rank = int(os.environ["LOCAL_RANK"]) + expected_world_size = args.dp_degree * args.tp_degree + if world_size != expected_world_size: + raise ValueError( + f"WORLD_SIZE ({world_size}) must equal dp-degree * tp-degree " + f"({args.dp_degree} * {args.tp_degree} = {expected_world_size})." + ) + if args.global_batch_size % args.dp_degree != 0: + raise ValueError( + f"global-batch-size ({args.global_batch_size}) must be divisible by " + f"dp-degree ({args.dp_degree})." + ) + local_batch_size = args.global_batch_size // args.dp_degree + if local_batch_size % args.microbatch_size != 0: + raise ValueError( + f"local batch size ({local_batch_size}) must be divisible by " + f"microbatch-size ({args.microbatch_size})." 
def make_local_tokens(args, mesh, device, vocab_size: int) -> torch.Tensor:
    """Return this rank's rows of a seeded synthetic token batch.

    The full ``(global_batch_size, seq_len + 1)`` batch is drawn on the CPU
    from a generator seeded with ``args.seed``, then the contiguous block of
    rows selected by the first mesh coordinate is moved to ``device``.

    Raises:
        RuntimeError: If ``mesh.get_coordinate()`` returns ``None``.
    """
    position = mesh.get_coordinate()
    if position is None:
        raise RuntimeError("DeviceMesh coordinate is unavailable on this rank.")
    dp_index, _ = position
    rows_per_rank = args.global_batch_size // args.dp_degree

    # A dedicated CPU generator keeps the draw independent of global RNG state.
    rng = torch.Generator(device="cpu").manual_seed(args.seed)
    full_batch = torch.randint(
        0,
        vocab_size,
        (args.global_batch_size, args.seq_len + 1),
        generator=rng,
        dtype=torch.long,
    )

    first_row = dp_index * rows_per_rank
    local_rows = full_batch[first_row : first_row + rows_per_rank]
    return local_rows.to(device, non_blocking=True)
def main():
    """Run the Qwen3 8B AutoParallel training sanity check end to end.

    Parses the CLI, initialises the distributed environment, builds the model
    on the meta device, lets AutoParallel search and apply a placement,
    materialises the weights and then trains on one fixed synthetic batch with
    gradient accumulation.

    Raises:
        RuntimeError: If NaNs appear in the logits or the loss, or if more
            than one step was run and the final loss is not lower than the
            first one.
    """
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG)

    device, mesh = init_distributed(args)
    tp_group = mesh.get_group("tp")
    tp_rank = mesh.get_local_rank("tp")
    local_batch_size = args.global_batch_size // args.dp_degree
    gradient_accumulation_steps = local_batch_size // args.microbatch_size

    torch.manual_seed(args.seed)
    model_args = qwen3_8b_args(max_seq_len=args.seq_len)
    # Trace with the batch the module sees per call: one microbatch per
    # data-parallel rank.
    trace_global_batch_size = args.microbatch_size * args.dp_degree

    with torch.device("meta"):
        model = Transformer(model_args)

    def input_fn():
        return torch.randint(
            0,
            model_args.vocab_size,
            (trace_global_batch_size, args.seq_len),
            device=device,
        )

    mp_policy = MixedPrecisionPolicy(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.float32,
    )

    print_rank0(
        "Qwen3 8B sanity check: "
        f"mesh=(dp={args.dp_degree}, tp={args.tp_degree}), "
        f"global_batch={args.global_batch_size}, "
        f"local_batch={local_batch_size}, "
        f"microbatch={args.microbatch_size}, "
        f"grad_accum={gradient_accumulation_steps}, "
        f"trace_global_batch={trace_global_batch_size}, "
        f"seq_len={args.seq_len}"
    )

    t0 = time.time()
    with AutoParallel(
        model,
        input_fn,
        mesh,
        mp_policy,
        repeated_subgraphs=True,
    ) as autop:
        autop.add_parameter_memory_constraint(low=None, high=None)
        # Inputs: batch sharded over dp, replicated over tp.
        autop.add_input_constraints([(Shard(0), Replicate())])
        # Outputs: batch sharded over dp, last (vocab) dim sharded over tp.
        autop.add_output_constraints([(Shard(0), Shard(2))])
        sharding_placement = autop.optimize_placement(verbose=args.verbose)
        parallel_mod = autop.apply_placement(sharding_placement)

    print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s")

    parallel_mod.to_empty(device=device)
    parallel_mod.init_weights(buffer_device=device, seed=args.seed)  # type: ignore[operator]

    if args.compile:
        parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend())

    # The batch is built once and reused for every step, so the loss is
    # expected to go down as the model fits it.
    batch = make_local_tokens(args, mesh, device, model_args.vocab_size)
    inputs = batch[:, :-1].contiguous()
    labels = batch[:, 1:].contiguous()  # next-token targets
    input_microbatches = inputs.split(args.microbatch_size, dim=0)
    label_microbatches = labels.split(args.microbatch_size, dim=0)
    global_token_count = args.global_batch_size * args.seq_len
    optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr)

    try:
        losses: list[float] = []
        for step in range(args.train_steps):
            optimizer.zero_grad(set_to_none=True)
            step_loss = torch.zeros((), device=device)
            for micro_inputs, micro_labels in zip(
                input_microbatches, label_microbatches
            ):
                logits = parallel_mod(micro_inputs)
                if torch.any(torch.isnan(logits)):
                    raise RuntimeError("Found NaNs in Qwen3 forward output.")

                loss = vocab_parallel_cross_entropy(
                    logits,
                    micro_labels,
                    vocab_size=model_args.vocab_size,
                    tp_group=tp_group,
                    tp_rank=tp_rank,
                    tp_degree=args.tp_degree,
                    global_token_count=global_token_count,
                )
                if torch.any(torch.isnan(loss)):
                    raise RuntimeError("Found NaNs in Qwen3 training loss.")

                loss.backward()
                step_loss = step_loss + loss.detach()

            torch.nn.utils.clip_grad_norm_(
                parallel_mod.parameters(), args.max_grad_norm
            )
            optimizer.step()

            with torch.no_grad():
                # Sum the per-rank partial losses over every rank for logging.
                logged_loss = step_loss.clone()
                dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM)
                loss_value = float(logged_loss.item())
            losses.append(loss_value)
            print_rank0(f"step={step:03d} loss={loss_value:.6f}")

        # Only compare first and last loss when there are at least two steps:
        # with one step they are the same value, with zero steps the list is
        # empty and indexing it would fail.
        if len(losses) > 1 and losses[-1] >= losses[0]:
            raise RuntimeError(
                f"Qwen3 training loss did not improve: initial={losses[0]:.6f}, "
                f"final={losses[-1]:.6f}"
            )

        if len(losses) > 1:
            print_rank0(
                f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}"
            )
        dist.barrier(device_ids=[device.index])
        torch.cuda.synchronize(device)
    finally:
        if dist.is_initialized():
            dist.destroy_process_group()
+ +import argparse +import logging +import os +import time + +import torch +import torch.distributed as dist +import torch.distributed.nn.functional as dist_nn_func +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Shard + +from autoparallel._testing.models.qwen3 import ( + Qwen3ModelArgs, + Transformer, + qwen3_235b_a22b_args, + qwen3_30b_a3b_args, + qwen3_moe_debug_args, +) +from autoparallel.api import AutoParallel +from autoparallel.compile import autoparallel_backend + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Run a real Qwen3 MoE AutoParallel training sanity check." + ) + parser.add_argument( + "--flavor", + choices=("moe-tiny", "moe-debug", "30b-a3b", "235b-a22b"), + default="30b-a3b", + help="Qwen3 MoE model size. Defaults to the real Qwen3-30B-A3B model.", + ) + parser.add_argument( + "--global-batch-size", + type=int, + default=4, + help="Global batch size across data-parallel ranks.", + ) + parser.add_argument( + "--microbatch-size", + type=int, + default=1, + help="Per-rank input microbatch size before EP all-gather inside the model.", + ) + parser.add_argument( + "--seq-len", + type=int, + default=8192, + help="Sequence length. Defaults to 8192 for the 4xH100 sanity run.", + ) + parser.add_argument( + "--dp-degree", + type=int, + default=2, + help="Data-parallel mesh degree.", + ) + parser.add_argument( + "--ep-degree", + type=int, + default=2, + help="Expert-parallel mesh degree.", + ) + parser.add_argument( + "--train-steps", + type=int, + default=30, + help="Number of optimizer steps.", + ) + parser.add_argument( + "--lr", + type=float, + default=3e-4, + help="Optimizer learning rate.", + ) + parser.add_argument( + "--optimizer", + choices=("adamw", "sgd", "none"), + default="adamw", + help="Optimizer to use after backward. 
def init_distributed(args):
    """Validate the launch configuration, then set up NCCL and the device mesh.

    Every check runs before any CUDA or process-group call, so a bad
    configuration fails fast without touching the GPU.

    Returns:
        ``(device, mesh)``: the CUDA device chosen from ``LOCAL_RANK`` and a
        2-D device mesh with dimensions named ``("dp", "ep")``.

    Raises:
        RuntimeError: If ``WORLD_SIZE`` or ``LOCAL_RANK`` is not set.
        ValueError: If ``WORLD_SIZE`` differs from ``dp_degree * ep_degree``,
            if the global batch is not divisible by ``dp_degree``, or if the
            per-DP-rank batch is not divisible by
            ``microbatch_size * ep_degree``.
    """
    env = os.environ
    if not ("WORLD_SIZE" in env and "LOCAL_RANK" in env):
        raise RuntimeError(
            "Run this example with torchrun, e.g. "
            "torchrun --standalone --nproc-per-node 4 "
            "examples/example_sanity_check_qwen3_moe.py"
        )

    world_size, local_rank = int(env["WORLD_SIZE"]), int(env["LOCAL_RANK"])
    dp_degree, ep_degree = args.dp_degree, args.ep_degree

    mesh_size = dp_degree * ep_degree
    if world_size != mesh_size:
        raise ValueError(
            f"WORLD_SIZE ({world_size}) must equal dp-degree * ep-degree "
            f"({dp_degree} * {ep_degree} = {mesh_size})."
        )

    global_batch = args.global_batch_size
    if global_batch % dp_degree != 0:
        raise ValueError(
            f"global-batch-size ({global_batch}) must be divisible by "
            f"dp-degree ({dp_degree})."
        )

    # One DP-rank microbatch is made of one per-rank microbatch per EP rank.
    rows_per_dp_rank = global_batch // dp_degree
    rows_per_dp_microbatch = args.microbatch_size * ep_degree
    if rows_per_dp_rank % rows_per_dp_microbatch != 0:
        raise ValueError(
            f"local DP batch size ({rows_per_dp_rank}) must be divisible by "
            f"microbatch-size * ep-degree "
            f"({args.microbatch_size} * {ep_degree} = {rows_per_dp_microbatch})."
        )

    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    dist.init_process_group("nccl", device_id=device)
    mesh = torch.distributed.device_mesh.init_device_mesh(
        "cuda",
        (dp_degree, ep_degree),
        mesh_dim_names=("dp", "ep"),
    )
    return device, mesh
def main():
    """Run the Qwen3 MoE AutoParallel training sanity check end to end.

    Parses the CLI, initialises the distributed environment, builds the MoE
    model on the meta device, lets AutoParallel search and apply a placement,
    materialises the weights and then trains on one fixed synthetic batch with
    gradient accumulation and a sequence-chunked vocab-parallel loss.

    Raises:
        ValueError: If ``num_experts`` is not divisible by ``ep_degree``.
        RuntimeError: If the loss contains NaNs, or if the improvement check
            is active and the final loss is not lower than the first one.
    """
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG)

    device, mesh = init_distributed(args)
    ep_group = mesh.get_group("ep")
    ep_rank = mesh.get_local_rank("ep")
    local_dp_batch_size = args.global_batch_size // args.dp_degree
    local_dp_microbatch = args.microbatch_size * args.ep_degree
    gradient_accumulation_steps = local_dp_batch_size // local_dp_microbatch

    torch.manual_seed(args.seed)
    model_args = make_model_args(args.flavor, args.seq_len)
    if args.seq_len is None:
        args.seq_len = model_args.max_seq_len
    if model_args.num_experts % args.ep_degree != 0:
        raise ValueError(
            f"num_experts ({model_args.num_experts}) must be divisible by "
            f"ep-degree ({args.ep_degree})."
        )
    # Trace with the batch of one call: one per-rank microbatch on every rank.
    trace_global_batch_size = args.microbatch_size * args.dp_degree * args.ep_degree

    with torch.device("meta"):
        model = Transformer(model_args, mesh=mesh, moe_axis_name="ep")

    def input_fn():
        return torch.randint(
            0,
            model_args.vocab_size,
            (trace_global_batch_size, args.seq_len),
            device=device,
        )

    mp_policy = MixedPrecisionPolicy(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.float32,
    )

    print_rank0(
        f"Qwen3 {args.flavor} sanity check: "
        f"mesh=(dp={args.dp_degree}, ep={args.ep_degree}), "
        f"global_batch={args.global_batch_size}, "
        f"local_dp_batch={local_dp_batch_size}, "
        f"per_rank_microbatch={args.microbatch_size}, "
        f"local_dp_microbatch={local_dp_microbatch}, "
        f"grad_accum={gradient_accumulation_steps}, "
        f"trace_global_batch={trace_global_batch_size}, "
        f"seq_len={args.seq_len}, "
        f"loss_chunk_size={args.loss_chunk_size}, "
        f"optimizer={args.optimizer}"
    )

    t0 = time.time()
    with AutoParallel(
        model,
        input_fn,
        mesh,
        mp_policy,
        dynamic=True,
        repeated_subgraphs=True,
    ) as autop:
        autop.add_parameter_memory_constraint(low=None, high=None)
        # Inputs: batch sharded over both dp and ep.
        autop.add_input_constraints([(Shard(0), Shard(0))])
        # Outputs: batch sharded over dp, last (vocab) dim sharded over ep.
        autop.add_output_constraints([(Shard(0), Shard(2))])
        sharding_placement = autop.optimize_placement(verbose=args.verbose)
        parallel_mod = autop.apply_placement(sharding_placement)

    print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s")
    print_cuda_memory("after AutoParallel", device)

    parallel_mod.to_empty(device=device)
    print_cuda_memory("after to_empty", device)
    parallel_mod.init_weights(buffer_device=device, seed=args.seed)  # type: ignore[operator]
    print_cuda_memory("after init_weights", device)

    if args.compile:
        parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend())

    # The batch is built once and reused for every step.
    batch = make_local_tokens(args, mesh, device, model_args.vocab_size)
    inputs = batch[:, :-1].contiguous()
    labels = batch[:, 1:].contiguous()  # next-token targets

    # Each rank feeds only its own `microbatch_size` rows of a DP microbatch
    # (selected by its ep coordinate), while the labels keep all rows of that
    # DP microbatch to match the logits constrained to Shard(0) over dp only.
    ep_coordinate = mesh.get_coordinate()[1]
    input_microbatches = []
    label_microbatches = []
    for start in range(0, local_dp_batch_size, local_dp_microbatch):
        stop = start + local_dp_microbatch
        input_block = inputs[start:stop]
        input_start = ep_coordinate * args.microbatch_size
        input_stop = input_start + args.microbatch_size
        input_microbatches.append(input_block[input_start:input_stop].contiguous())
        label_microbatches.append(labels[start:stop].contiguous())

    global_token_count = args.global_batch_size * args.seq_len
    if args.optimizer == "adamw":
        optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr)
    elif args.optimizer == "sgd":
        optimizer = torch.optim.SGD(parallel_mod.parameters(), lr=args.lr)
    else:
        optimizer = None

    # Without an optimizer the parameters never change, so the loss on the
    # fixed batch cannot be expected to go down.
    check_improvement = (
        not args.skip_loss_improvement_check and optimizer is not None
    )

    try:
        losses: list[float] = []
        for step in range(args.train_steps):
            if optimizer is not None:
                optimizer.zero_grad(set_to_none=True)
            else:
                parallel_mod.zero_grad(set_to_none=True)
            step_loss = torch.zeros((), device=device)
            for micro_inputs, micro_labels in zip(
                input_microbatches, label_microbatches
            ):
                logits = parallel_mod(micro_inputs)

                # Cut the graph at the logits: each chunk's backward stops at
                # this leaf and only accumulates d(loss)/d(logits).  The model
                # itself is then back-propagated once per microbatch instead
                # of once per loss chunk.
                logits_leaf = logits.detach().requires_grad_(True)
                for seq_start, seq_stop in chunk_ranges(
                    logits.shape[1], args.loss_chunk_size
                ):
                    loss = vocab_parallel_cross_entropy(
                        logits_leaf[:, seq_start:seq_stop],
                        micro_labels[:, seq_start:seq_stop],
                        vocab_size=model_args.vocab_size,
                        vocab_group=ep_group,
                        vocab_rank=ep_rank,
                        vocab_degree=args.ep_degree,
                        global_token_count=global_token_count,
                    )
                    if torch.any(torch.isnan(loss)):
                        raise RuntimeError("Found NaNs in Qwen3 MoE training loss.")

                    loss.backward()
                    step_loss = step_loss + loss.detach()

                # `grad` stays None only when no chunk was processed.
                if logits_leaf.grad is not None:
                    logits.backward(logits_leaf.grad)

            torch.nn.utils.clip_grad_norm_(
                parallel_mod.parameters(), args.max_grad_norm
            )
            if optimizer is not None:
                optimizer.step()

            with torch.no_grad():
                # Sum the per-rank partial losses over every rank for logging.
                logged_loss = step_loss.clone()
                dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM)
                loss_value = float(logged_loss.item())
            losses.append(loss_value)
            print_rank0(f"step={step:03d} loss={loss_value:.6f}")
            print_cuda_memory(f"after step {step:03d}", device)

        # First and last loss are only comparable with at least two steps.
        if len(losses) > 1:
            improved = losses[-1] < losses[0]
            if check_improvement and not improved:
                raise RuntimeError(
                    f"Qwen3 MoE training loss did not improve: "
                    f"initial={losses[0]:.6f}, final={losses[-1]:.6f}"
                )
            if improved:
                print_rank0(
                    f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}"
                )
            else:
                # Reached only when the check is disabled; report it honestly.
                print_rank0(
                    f"Loss did not improve (check skipped): "
                    f"initial={losses[0]:.6f}, final={losses[-1]:.6f}"
                )
        dist.barrier(device_ids=[device.index])
        torch.cuda.synchronize(device)
    finally:
        if dist.is_initialized():
            dist.destroy_process_group()
def parse_args():
    """Parse the command line of the torchtitan dense Qwen3 example.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Run torchtitan's dense Qwen3 model through AutoParallel's "
            "searched placement on real GPUs."
        )
    )

    # One (flag, keyword-arguments) entry per option, in help-output order.
    option_specs = [
        (
            "--flavor",
            {
                "choices": (
                    "debugmodel",
                    "debugmodel_fused_qkv",
                    "0.6B",
                    "1.7B",
                    "4B",
                    "8B",
                ),
                "default": "8B",
                "help": "Dense torchtitan Qwen3 flavor.",
            },
        ),
        (
            "--global-batch-size",
            {
                "type": int,
                "default": 4,
                "help": "Global batch size across data-parallel ranks.",
            },
        ),
        (
            "--microbatch-size",
            {
                "type": int,
                "default": 1,
                "help": "Per-DP-rank microbatch size for gradient accumulation.",
            },
        ),
        (
            "--seq-len",
            {
                "type": int,
                "default": 2048,
                "help": "Sequence length for the real sanity run.",
            },
        ),
        (
            "--dp-degree",
            {"type": int, "default": 2, "help": "Data-parallel mesh degree."},
        ),
        (
            "--tp-degree",
            {"type": int, "default": 2, "help": "Tensor-parallel mesh degree."},
        ),
        (
            "--train-steps",
            {"type": int, "default": 2, "help": "Number of optimizer steps."},
        ),
        (
            "--lr",
            {"type": float, "default": 3e-4, "help": "AdamW learning rate."},
        ),
        (
            "--max-grad-norm",
            {"type": float, "default": 1.0, "help": "Gradient clipping max norm."},
        ),
        (
            "--seed",
            {
                "type": int,
                "default": 0,
                "help": "Seed for model initialization and synthetic data generation.",
            },
        ),
        (
            "--compile",
            {
                "action": "store_true",
                "help": "Compile the placed module with the AutoParallel backend before training.",
            },
        ),
        (
            "--verbose",
            {
                "action": "store_true",
                "help": "Print the full AutoParallel optimizer log.",
            },
        ),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    return parser.parse_args()
def make_local_tokens(args, mesh, device, vocab_size: int) -> torch.Tensor:
    """Return this rank's slice of a seeded synthetic token batch.

    A ``(global_batch_size, seq_len + 1)`` tensor of token ids is generated on
    the CPU from ``args.seed``; the rows selected by the first mesh coordinate
    are then copied to ``device``.

    Raises:
        RuntimeError: If ``mesh.get_coordinate()`` returns ``None``.
    """
    coords = mesh.get_coordinate()
    if coords is None:
        raise RuntimeError("DeviceMesh coordinate is unavailable on this rank.")
    dp_index, _ = coords
    per_rank = args.global_batch_size // args.dp_degree

    # Seeded CPU generator: the draw does not depend on global RNG state.
    cpu_rng = torch.Generator(device="cpu")
    cpu_rng.manual_seed(args.seed)
    shape = (args.global_batch_size, args.seq_len + 1)
    all_tokens = torch.randint(
        0, vocab_size, shape, generator=cpu_rng, dtype=torch.long
    )

    own_rows = slice(dp_index * per_rank, dp_index * per_rank + per_rank)
    return all_tokens[own_rows].to(device, non_blocking=True)
def main():
    """Train torchtitan's dense Qwen3 model under an AutoParallel placement.

    Parses the CLI, initialises the distributed environment, builds the
    torchtitan model on the meta device, lets AutoParallel search and apply a
    placement, materialises the weights and then trains on one fixed synthetic
    batch with gradient accumulation.

    Raises:
        RuntimeError: If NaNs appear in the logits or the loss, or if more
            than one step was run and the final loss is not lower than the
            first one.
    """
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG)

    device, mesh = init_distributed(args)
    tp_group = mesh.get_group("tp")
    tp_rank = mesh.get_local_rank("tp")
    local_batch_size = args.global_batch_size // args.dp_degree
    gradient_accumulation_steps = local_batch_size // args.microbatch_size

    torch.manual_seed(args.seed)
    model_config = make_model_config(args.flavor, args.seq_len)
    vocab_size = model_config.vocab_size
    # The placed module is called with `microbatch_size` rows per DP rank, so
    # trace with that batch (one microbatch per DP rank), not the full global
    # batch, which would give a different per-rank shape.
    trace_global_batch_size = args.microbatch_size * args.dp_degree

    with torch.device("meta"):
        model = model_config.build()

    def input_fn():
        return torch.randint(
            0,
            vocab_size,
            (trace_global_batch_size, args.seq_len),
            device=device,
        )

    mp_policy = MixedPrecisionPolicy(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.float32,
    )

    print_rank0(
        f"torchtitan Qwen3 {args.flavor} via AutoParallel: "
        f"mesh=(dp={args.dp_degree}, tp={args.tp_degree}), "
        f"global_batch={args.global_batch_size}, "
        f"local_batch={local_batch_size}, "
        f"microbatch={args.microbatch_size}, "
        f"grad_accum={gradient_accumulation_steps}, "
        f"seq_len={args.seq_len}"
    )

    t0 = time.time()
    with AutoParallel(
        model,
        input_fn,
        mesh,
        mp_policy,
        repeated_subgraphs=True,
    ) as autop:
        autop.add_parameter_memory_constraint(low=None, high=None)
        # Inputs: batch sharded over dp, replicated over tp.
        autop.add_input_constraints([(Shard(0), Replicate())])
        # Outputs: batch sharded over dp, last (vocab) dim sharded over tp.
        autop.add_output_constraints([(Shard(0), Shard(2))])
        sharding_placement = autop.optimize_placement(verbose=args.verbose)
        parallel_mod = autop.apply_placement(sharding_placement)

    print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s")

    parallel_mod.to_empty(device=device)
    # Re-seed right before weight init; `init_weights` is not given a seed here.
    torch.manual_seed(args.seed)
    parallel_mod.init_weights(buffer_device=device)  # type: ignore[operator]

    if args.compile:
        parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend())

    # The batch is built once and reused for every step.
    batch = make_local_tokens(args, mesh, device, vocab_size)
    inputs = batch[:, :-1].contiguous()
    labels = batch[:, 1:].contiguous()  # next-token targets
    input_microbatches = torch.split(inputs, args.microbatch_size, dim=0)
    label_microbatches = torch.split(labels, args.microbatch_size, dim=0)

    global_token_count = args.global_batch_size * args.seq_len
    optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr)

    try:
        losses: list[float] = []
        for step in range(args.train_steps):
            optimizer.zero_grad(set_to_none=True)
            step_loss = torch.zeros((), device=device)
            for micro_inputs, micro_labels in zip(
                input_microbatches, label_microbatches
            ):
                logits = parallel_mod(micro_inputs)
                if torch.any(torch.isnan(logits)):
                    raise RuntimeError("Found NaNs in forward output.")

                loss = vocab_parallel_cross_entropy(
                    logits,
                    micro_labels,
                    vocab_size=vocab_size,
                    tp_group=tp_group,
                    tp_rank=tp_rank,
                    tp_degree=args.tp_degree,
                    global_token_count=global_token_count,
                )
                if torch.any(torch.isnan(loss)):
                    raise RuntimeError("Found NaNs in training loss.")

                loss.backward()
                step_loss = step_loss + loss.detach()

            torch.nn.utils.clip_grad_norm_(
                parallel_mod.parameters(), args.max_grad_norm
            )
            optimizer.step()

            with torch.no_grad():
                # Sum the per-rank partial losses over every rank for logging.
                logged_loss = step_loss.clone()
                dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM)
                loss_value = float(logged_loss.item())
            losses.append(loss_value)
            print_rank0(f"step={step:03d} loss={loss_value:.6f}")

        if len(losses) > 1 and losses[-1] >= losses[0]:
            raise RuntimeError(
                f"Training loss did not improve: "
                f"initial={losses[0]:.6f}, final={losses[-1]:.6f}"
            )

        if len(losses) > 1:
            print_rank0(
                f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}"
            )
        elif losses:
            # Exactly one step; with zero steps there is no loss to report.
            print_rank0(f"Completed one step: loss={losses[0]:.6f}")
        dist.barrier(device_ids=[device.index])
        torch.cuda.synchronize(device)
    finally:
        if dist.is_initialized():
            dist.destroy_process_group()
+ +import sys +from pathlib import Path + +import pytest +import torch + +from autoparallel._testing.models.dsv3 import DeepSeekV3Model + + +def test_dsv3_accepts_torchtitan_grouped_experts_config(): + torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan" + if not torchtitan_root.exists(): + pytest.skip("torchtitan sibling checkout not found") + sys.path.insert(0, str(torchtitan_root)) + + try: + from torchtitan.models.deepseek_v3 import deepseekv3_configs # type: ignore[import-not-found] + except Exception as exc: + pytest.skip(f"torchtitan DeepSeek-V3 config unavailable: {exc}") + + with torch.device("meta"): + model = DeepSeekV3Model( + deepseekv3_configs["debugmodel"]( + attn_backend="sdpa", + moe_comm_backend="standard", + ) + ) + + moe_layer = next(layer for layer in model.layers.values() if layer.moe_enabled) + assert moe_layer.moe.experts.use_grouped_mm diff --git a/tests/test_qwen3.py b/tests/test_qwen3.py new file mode 100644 index 00000000..5b32bc5b --- /dev/null +++ b/tests/test_qwen3.py @@ -0,0 +1,323 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
+ +import sys +from pathlib import Path + +import pytest +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor import DTensor +from torch.distributed.tensor.placement_types import Replicate, Shard + +from autoparallel._testing.models.qwen3 import ( + Qwen3ModelArgs, + Transformer, + apply_rotary_emb_cos_sin, + qwen3_debug_args, + qwen3_args_from_torchtitan_config, + qwen3_moe_debug_args, +) +from autoparallel.api import AutoParallel, auto_parallel + + +def _tiny_args(**overrides) -> Qwen3ModelArgs: + args = Qwen3ModelArgs( + dim=64, + n_layers=2, + n_heads=4, + n_kv_heads=2, + head_dim=16, + hidden_dim=128, + vocab_size=128, + max_seq_len=16, + ) + for key, value in overrides.items(): + setattr(args, key, value) + args.__post_init__() + return args + + +def _tiny_moe_args(**overrides) -> Qwen3ModelArgs: + args = Qwen3ModelArgs( + dim=32, + n_layers=1, + n_heads=4, + n_kv_heads=2, + head_dim=8, + hidden_dim=64, + vocab_size=64, + max_seq_len=4, + moe_enabled=True, + moe_hidden_dim=16, + num_experts=64, + top_k=8, + route_norm=True, + score_before_experts=False, + moe_axis_name="tp", + ) + for key, value in overrides.items(): + setattr(args, key, value) + args.__post_init__() + return args + + +def test_qwen3_forward_shape(): + args = _tiny_args() + model = Transformer(args) + model.init_weights(seed=0) + + tokens = torch.randint(0, args.vocab_size, (2, args.max_seq_len)) + logits = model(tokens) + + assert logits.shape == (2, args.max_seq_len, args.vocab_size) + + +def test_qwen3_qk_norm_changes_logits(): + args = _tiny_args(n_layers=1) + model = Transformer(args) + model.init_weights(seed=0) + + tokens = torch.randint(0, args.vocab_size, (2, args.max_seq_len)) + logits = model(tokens) + + with torch.no_grad(): + model.layers["0"].attention.q_norm.weight.zero_() + logits_without_q = model(tokens) + + assert not torch.allclose(logits, logits_without_q) + + +def test_qwen3_weight_tying_survives_init_weights(): + args = 
_tiny_args(enable_weight_tying=True) + model = Transformer(args) + + assert model.tok_embeddings.weight is model.lm_head.weight + model.init_weights(seed=0) + assert model.tok_embeddings.weight is model.lm_head.weight + + +def test_qwen3_debug_args_matches_torchtitan_dense_shape(): + args = qwen3_debug_args(max_seq_len=32) + + assert args.dim == 256 + assert args.n_layers == 8 + assert args.n_heads == 16 + assert args.n_kv_heads == 8 + assert args.head_dim == 128 + assert args.hidden_dim == 3072 + assert args.vocab_size == 2048 + assert args.rope_theta == 1000000.0 + assert args.enable_weight_tying + + +def test_qwen3_moe_debug_args_matches_torchtitan_shape(): + args = qwen3_moe_debug_args(max_seq_len=32) + + assert args.dim == 256 + assert args.n_layers == 8 + assert args.n_heads == 16 + assert args.n_kv_heads == 8 + assert args.head_dim == 128 + assert args.moe_enabled + assert args.moe_hidden_dim == 768 + assert args.num_experts == 64 + assert args.top_k == 8 + assert args.route_norm + assert not args.score_before_experts + + +@pytest.mark.parametrize( + ("flavor", "expected"), + [ + ( + "8B", + { + "dim": 4096, + "n_layers": 36, + "n_heads": 32, + "n_kv_heads": 8, + "head_dim": 128, + "hidden_dim": 12288, + "vocab_size": 151936, + "moe_enabled": False, + "num_experts": 0, + "top_k": 1, + "max_seq_len": 4096, + }, + ), + ( + "30B-A3B", + { + "dim": 2048, + "n_layers": 48, + "n_heads": 32, + "n_kv_heads": 4, + "head_dim": 128, + "hidden_dim": 0, + "vocab_size": 151936, + "moe_enabled": True, + "moe_hidden_dim": 768, + "num_experts": 128, + "top_k": 8, + "route_norm": True, + "score_before_experts": False, + "max_seq_len": 262144, + }, + ), + ], +) +def test_qwen3_args_from_torchtitan_config(flavor, expected): + torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan" + if not torchtitan_root.exists(): + pytest.skip("torchtitan sibling checkout not found") + sys.path.insert(0, str(torchtitan_root)) + + try: + from torchtitan.models.qwen3 import 
qwen3_configs # type: ignore[import-not-found] + except Exception as exc: + pytest.skip(f"torchtitan Qwen3 config unavailable: {exc}") + + args = qwen3_args_from_torchtitan_config( + qwen3_configs[flavor](attn_backend="sdpa") + ) + + for attr, value in expected.items(): + assert getattr(args, attr) == value + assert args.rope_theta == 1000000.0 + assert args.norm_eps == 1e-6 + + +def test_qwen3_cos_sin_rope_matches_torchtitan_helper(): + torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan" + if not torchtitan_root.exists(): + pytest.skip("torchtitan sibling checkout not found") + sys.path.insert(0, str(torchtitan_root)) + + try: + from torchtitan.models.common.rope import ( # type: ignore[import-not-found] + RoPE, + apply_rotary_emb_cos_sin as tt_apply_rotary_emb_cos_sin, + ) + except Exception as exc: + pytest.skip(f"torchtitan Qwen3 RoPE helper unavailable: {exc}") + + args = _tiny_args() + rope = RoPE( + RoPE.Config( + dim=args.head_dim, + max_seq_len=args.max_seq_len, + theta=args.rope_theta, + backend="cos_sin", + ) + ) + xq = torch.randn(2, args.max_seq_len, args.n_heads, args.head_dim) + xk = torch.randn(2, args.max_seq_len, args.n_kv_heads, args.head_dim) + + actual = apply_rotary_emb_cos_sin(xq, xk, rope.cache) + expected = tt_apply_rotary_emb_cos_sin(xq, xk, rope.cache) + + torch.testing.assert_close(actual[0], expected[0]) + torch.testing.assert_close(actual[1], expected[1]) + + +def test_qwen3_autoparallel_pipeline_smoke(device_mesh_2d): + args = _tiny_args(n_layers=2, max_seq_len=8) + batch_size = 2 * device_mesh_2d.shape[0] + + with torch.device("meta"): + model = Transformer(args) + + def input_fn(): + return torch.randint( + 0, + args.vocab_size, + (batch_size, args.max_seq_len), + device="cuda", + ) + + mp_policy = MixedPrecisionPolicy( + param_dtype=torch.bfloat16, + reduce_dtype=torch.float32, + ) + + with AutoParallel( + model, + input_fn, + device_mesh_2d, + mp_policy, + repeated_subgraphs=True, + ) as autop: + 
autop.add_input_constraints([(Shard(0), Replicate())]) + autop.add_output_constraints([(Shard(0), Shard(2))]) + sharding_placement = autop.optimize_placement(verbose=False) + parallel_mod = autop.apply_placement(sharding_placement) + + assert isinstance(parallel_mod, Transformer) + + +def test_qwen3_moe_auto_parallel_smoke(device_mesh_2d): + args = _tiny_moe_args() + local_batch_size = 1 + + with torch.device("meta"): + model = Transformer(args, mesh=device_mesh_2d, moe_axis_name="tp") + + expected_param_shapes = { + name: tuple(param.shape) for name, param in model.named_parameters() + } + expected_nparams = sum(param.numel() for param in model.parameters()) + + tokens = DTensor.from_local( + torch.randint( + 0, + args.vocab_size, + (local_batch_size, args.max_seq_len), + device="cuda", + ), + device_mesh_2d, + [Shard(0), Shard(0)], + ) + + mp_policy = MixedPrecisionPolicy( + param_dtype=torch.bfloat16, + reduce_dtype=torch.float32, + ) + parallel_mod = auto_parallel( + model, + device_mesh_2d, + sample_inputs=(tokens,), + out_shardings=(Shard(0), Shard(2)), + mp_policy=mp_policy, + dynamic=True, + ) + + assert isinstance(parallel_mod, Transformer) + assert sum(param.numel() for param in parallel_mod.parameters()) == expected_nparams + assert { + name: tuple(param.shape) for name, param in parallel_mod.named_parameters() + } == expected_param_shapes + assert parallel_mod.layers["0"].moe.experts.w1.shape == ( + args.num_experts, + args.moe_hidden_dim, + args.dim, + ) + + parallel_mod.to_empty(device="cuda") + parallel_mod.init_weights(buffer_device=torch.device("cuda"), seed=0) + + local_tokens = torch.randint( + 0, + args.vocab_size, + (local_batch_size, args.max_seq_len), + device="cuda", + ) + out = parallel_mod(local_tokens) + assert out.shape == ( + local_batch_size * device_mesh_2d.shape[1], + args.max_seq_len, + args.vocab_size // device_mesh_2d.shape[1], + ) + out.backward(torch.randn_like(out)) From b02ac054facca0aaaaa70b380f550bd3d2f85188 Mon Sep 17 
00:00:00 2001 From: Kaijian Wang Date: Thu, 28 May 2026 09:47:51 -0700 Subject: [PATCH 02/27] Add sharding optimizer profiling snapshot Record optimizer setup and solve profiling in ShardingOptimizer, add a contributor pipeline document, and include the profiling result artifacts used to inspect LLaMA and Qwen behavior. Authored with Claude. --- autoparallel/optimize_sharding.py | 348 +- docs/codebase_pipeline.md | 593 + examples/example_llama3.py | 7 +- ...ama3_3b_ilp_node_indegree_distribution.svg | 51 + .../llama3_8b_4x4_strategy_full.json | 287470 +++++++++++++++ .../llama3_8b_4x4_strategy_summary.json | 2054 + .../real_llama3_3b_dag_node_stats.csv | 7200 + .../real_llama3_3b_dag_summary.json | 883 + .../real_llama3_3b_merge_points.csv | 1668 + profile_results/real_llama3_by_mesh_dim.svg | 167 + profile_results/real_llama3_by_model_size.svg | 177 + profile_results/real_llama3_dag_analysis.py | 255 + .../real_llama3_optimizer_presolve_3d4d.log | 7 + .../real_llama3_optimizer_sweep.csv | 9 + .../real_llama3_optimizer_sweep.jsonl | 8 + .../real_llama3_optimizer_sweep.log | 54 + .../real_llama3_optimizer_sweep.py | 351 + .../real_llama3_partial_presolve.csv | 3 + profile_results/real_llama3_timeouts.csv | 3 + pyproject.toml | 6 + qwen3_8b_autoparallel_30steps.log | 1 + qwen3_8b_autoparallel_30steps_loss_curve.png | 1 + qwen3_8b_autoparallel_30steps_loss_curve.svg | 1 + qwen3_8b_autoparallel_30steps_losses.csv | 1 + qwen3_moe_mast_20steps_loss_curve.png | Bin 0 -> 19666 bytes qwen3_moe_mast_20steps_loss_curve.svg | 68 + qwen3_moe_mast_20steps_losses.csv | 21 + 27 files changed, 301396 insertions(+), 11 deletions(-) create mode 100644 docs/codebase_pipeline.md create mode 100644 profile_results/llama3_3b_ilp_node_indegree_distribution.svg create mode 100644 profile_results/llama3_8b_4x4_strategy_full.json create mode 100644 profile_results/llama3_8b_4x4_strategy_summary.json create mode 100644 profile_results/real_llama3_3b_dag_node_stats.csv create mode 100644 
profile_results/real_llama3_3b_dag_summary.json create mode 100644 profile_results/real_llama3_3b_merge_points.csv create mode 100644 profile_results/real_llama3_by_mesh_dim.svg create mode 100644 profile_results/real_llama3_by_model_size.svg create mode 100644 profile_results/real_llama3_dag_analysis.py create mode 100644 profile_results/real_llama3_optimizer_presolve_3d4d.log create mode 100644 profile_results/real_llama3_optimizer_sweep.csv create mode 100644 profile_results/real_llama3_optimizer_sweep.jsonl create mode 100644 profile_results/real_llama3_optimizer_sweep.log create mode 100644 profile_results/real_llama3_optimizer_sweep.py create mode 100644 profile_results/real_llama3_partial_presolve.csv create mode 100644 profile_results/real_llama3_timeouts.csv create mode 120000 qwen3_8b_autoparallel_30steps.log create mode 120000 qwen3_8b_autoparallel_30steps_loss_curve.png create mode 120000 qwen3_8b_autoparallel_30steps_loss_curve.svg create mode 120000 qwen3_8b_autoparallel_30steps_losses.csv create mode 100644 qwen3_moe_mast_20steps_loss_curve.png create mode 100644 qwen3_moe_mast_20steps_loss_curve.svg create mode 100644 qwen3_moe_mast_20steps_losses.csv diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 9692ef2f..2b1909ee 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -241,14 +241,36 @@ def __init__( ) self._constraint_log: list[tuple[str, dict]] = [] self._name_counters: dict[str, int] = {} + self.profile: dict[str, Any] = { + "mesh": self._profile_mesh(), + "model": self._profile_model(), + "timings": {}, + } + t_init_start = time.perf_counter() t0 = time.perf_counter() self.strats = self.build_sharding_metadata() + t_strategy = time.perf_counter() - t0 + self.profile["timings"]["strategy_enumeration_s"] = t_strategy + self.profile["strategies"] = self._profile_strategies() + logger.info( + "ShardingOptimizer phase profile: phase=strategy_enumeration " + "mesh_shape=%s 
mesh_dim_names=%s mesh_size=%s model_params=%s " + "graph_nodes=%s strategy_options=%s option_tuples=%s elapsed=%.3fs", + self.profile["mesh"]["shape"], + self.profile["mesh"]["dim_names"], + self.profile["mesh"]["size"], + self._format_billions(self.profile["model"]["parameter_numel"]), + self.profile["model"]["graph_nodes"], + self.profile["strategies"]["strategy_options"], + self.profile["strategies"]["option_tuples"], + t_strategy, + ) # nodes/node_map are derived from strats (not graph.nodes) so that # shape-computation nodes skipped by build_sharding_metadata don't # appear and indices stay consistent. self.nodes = list(self.strats.keys()) self.node_map = {node: i for i, node in enumerate(self.nodes)} - logger.debug("Placement options took %.3fs", time.perf_counter() - t0) + logger.debug("Placement options took %.3fs", t_strategy) from autoparallel.shardings.placement_options import get_placement_options_timer get_placement_options_timer().report() @@ -263,13 +285,77 @@ def __init__( t0 = time.perf_counter() self.decision_vars = self._build_decision_vars() t1 = time.perf_counter() + logger.info( + "ShardingOptimizer phase profile: phase=decision_vars " + "mesh_shape=%s mesh_dim_names=%s mesh_size=%s model_params=%s " + "unique_ilp_vars=%s logical_decision_vars=%s " + "cluster_copied_decision_vars=%s pulp_var_creation=%.3fs " + "compute_cost=%.3fs edge_cost=%.3fs cost_estimation=%.3fs " + "elapsed=%.3fs", + self.profile["mesh"]["shape"], + self.profile["mesh"]["dim_names"], + self.profile["mesh"]["size"], + self._format_billions(self.profile["model"]["parameter_numel"]), + self._decision_var_profile["unique_pulp_variables"], + self._decision_var_profile["logical_decision_variables"], + self._decision_var_profile["cluster_copied_decision_variables"], + self._decision_var_profile["pulp_var_creation_s"], + self._decision_var_profile["compute_cost_estimation_s"], + self._decision_var_profile["edge_cost_estimation_s"], + 
self._decision_var_profile["cost_estimation_s"], + t1 - t0, + ) self.validate() t2 = time.perf_counter() self.prob = pulp.LpProblem("AutoParallel", pulp.LpMinimize) self.add_default_constraints() t3 = time.perf_counter() + decision_var_build_s = t1 - t0 + cost_estimation_s = self._decision_var_profile["cost_estimation_s"] + decision_var_overhead_s = max( + decision_var_build_s + - self._decision_var_profile["pulp_var_creation_s"] + - cost_estimation_s, + 0.0, + ) + self.profile["timings"].update( + { + "decision_var_build_s": decision_var_build_s, + "decision_var_overhead_s": decision_var_overhead_s, + "validation_s": t2 - t1, + "constraint_construction_s": t3 - t2, + "ilp_construction_s": ( + self._decision_var_profile["pulp_var_creation_s"] + + decision_var_overhead_s + + (t3 - t2) + ), + "init_total_s": t3 - t_init_start, + } + ) n_unique_vars = len(self.pulp_variables) n_constraints = len(self.prob.constraints) + self.profile["ilp"] = { + "unique_variables": n_unique_vars, + "logical_decision_variables": self._decision_var_profile[ + "logical_decision_variables" + ], + "cluster_copied_decision_variables": self._decision_var_profile[ + "cluster_copied_decision_variables" + ], + "constraints": n_constraints, + } + logger.info( + "ShardingOptimizer phase profile: phase=constraints " + "mesh_shape=%s mesh_dim_names=%s mesh_size=%s model_params=%s " + "unique_ilp_vars=%s constraints=%s elapsed=%.3fs", + self.profile["mesh"]["shape"], + self.profile["mesh"]["dim_names"], + self.profile["mesh"]["size"], + self._format_billions(self.profile["model"]["parameter_numel"]), + n_unique_vars, + n_constraints, + t3 - t2, + ) logger.debug( "ILP construction took %.3fs " "(decision_vars=%.3fs, validate=%.3fs, constraints=%.3fs)", @@ -284,6 +370,157 @@ def __init__( len(self.decision_vars), n_constraints, ) + self._log_init_profile() + + def _profile_mesh(self): + try: + mesh_shape = tuple(int(d) for d in self.mesh.shape) + except Exception: + mesh_shape = tuple() + try: + 
mesh_size = int(self.mesh.size()) + except Exception: + mesh_size = math.prod(mesh_shape) if mesh_shape else None + return { + "ndim": getattr(self.mesh, "ndim", len(mesh_shape)), + "shape": mesh_shape, + "dim_names": getattr(self.mesh, "mesh_dim_names", None), + "size": mesh_size, + } + + def _profile_model(self): + graph_nodes = list(self.graph.nodes) + op_counts = defaultdict(int) + tensor_nodes = 0 + for node in graph_nodes: + op_counts[node.op] += 1 + if _produces_tensor(node.meta.get("val")): + tensor_nodes += 1 + + param_numel = 0 + param_bytes = 0 + unknown_param_nodes = 0 + try: + param_nodes = get_param_nodes(self.graph) + except Exception: + param_nodes = [] + unknown_param_nodes = None + + for node in param_nodes: + val = node.meta.get("val") + if not isinstance(val, torch.Tensor): + unknown_param_nodes += 1 + continue + numel = self._safe_tensor_numel(val) + if numel is None: + unknown_param_nodes += 1 + continue + param_numel += numel + try: + param_bytes += numel * val.element_size() + except Exception: + pass + + return { + "graph_nodes": len(graph_nodes), + "tensor_nodes": tensor_nodes, + "op_counts": dict(op_counts), + "parameter_nodes": len(param_nodes), + "parameter_numel": param_numel, + "parameter_bytes": param_bytes, + "unknown_parameter_nodes": unknown_param_nodes, + } + + @staticmethod + def _safe_tensor_numel(tensor): + try: + numel = tensor.numel() + if isinstance(numel, int): + return numel + return int(numel) + except Exception: + pass + + shape = getattr(tensor, "shape", None) + if shape is None: + return None + + total = 1 + for dim in shape: + dim = concretize_symint(dim) + if not isinstance(dim, int): + return None + total *= dim + return total + + def _profile_strategies(self): + strategy_options = 0 + option_tuples = 0 + max_strategies_per_node = 0 + for node in self.strats: + if node.op == "output" or not hasattr(self.strats[node], "strategies"): + continue + strategies = self.strats[node].strategies + strategy_options += 
len(strategies) + max_strategies_per_node = max(max_strategies_per_node, len(strategies)) + option_tuples += sum(1 for _ in self.walk_over_options(node)) + return { + "nodes": len(self.strats), + "strategy_options": strategy_options, + "option_tuples": option_tuples, + "max_strategies_per_node": max_strategies_per_node, + } + + @staticmethod + def _format_billions(count): + if count is None: + return "unknown" + if count >= 1_000_000_000: + return f"{count / 1_000_000_000:.2f}B" + if count >= 1_000_000: + return f"{count / 1_000_000:.2f}M" + return str(count) + + @staticmethod + def _safe_float(value): + try: + return float(value) + except Exception: + return math.nan + + def _log_init_profile(self): + mesh = self.profile["mesh"] + model = self.profile["model"] + strategies = self.profile["strategies"] + ilp = self.profile["ilp"] + timings = self.profile["timings"] + logger.info( + "ShardingOptimizer init profile: " + "mesh_shape=%s mesh_dim_names=%s mesh_size=%s " + "model_params=%s param_nodes=%s graph_nodes=%s tensor_nodes=%s " + "strategy_options=%s option_tuples=%s " + "unique_ilp_vars=%s logical_decision_vars=%s constraints=%s " + "timings={strategy_enumeration=%.3fs,cost_estimation=%.3fs," + "ilp_construction=%.3fs,validation=%.3fs,total=%.3fs}", + mesh["shape"], + mesh["dim_names"], + mesh["size"], + self._format_billions(model["parameter_numel"]), + model["parameter_nodes"], + model["graph_nodes"], + model["tensor_nodes"], + strategies["strategy_options"], + strategies["option_tuples"], + ilp["unique_variables"], + ilp["logical_decision_variables"], + ilp["constraints"], + timings["strategy_enumeration_s"], + timings["cost_estimation_s"], + timings["ilp_construction_s"], + timings["validation_s"], + timings["init_total_s"], + ) + logger.debug("ShardingOptimizer init profile detail: %s", self.profile) def _get_next_name(self, prefix): idx = self._name_counters.setdefault(prefix, 0) @@ -580,6 +817,23 @@ def _build_decision_vars(self): t_compute, t_edge, ) + 
self._decision_var_profile = { + "logical_decision_variables": n_vars, + "cluster_copied_decision_variables": n_cluster_copied, + "unique_pulp_variables": len(self.pulp_variables), + "pulp_var_creation_s": t_pulp_end - t_pulp_start, + "compute_cost_estimation_s": t_compute, + "edge_cost_estimation_s": t_edge, + "cost_estimation_s": t_compute + t_edge, + } + self.profile["timings"].update( + { + "pulp_var_creation_s": t_pulp_end - t_pulp_start, + "compute_cost_estimation_s": t_compute, + "edge_cost_estimation_s": t_edge, + "cost_estimation_s": t_compute + t_edge, + } + ) return decision_vars def _resolve_decision_var(self, key): @@ -884,9 +1138,11 @@ def _solve(self, verbose=False): # Use a dedicated temp directory for PuLP's intermediate files (.mps, # .sol, etc.) so they are always cleaned up, even if the process is # killed. Without this, leftover files can fill up /tmp (tmpfs). + t0 = time.perf_counter() with tempfile.TemporaryDirectory() as tmpdir: solver.tmpDir = tmpdir self.prob.solve(solver) + solve_s = time.perf_counter() - t0 self.selected_keys = [ key for key, dv in self.decision_vars.items() if dv.var.value() == 1 @@ -904,6 +1160,60 @@ def _solve(self, verbose=False): "constraints, and consider relaxing input/output constraints or " "using a larger mesh." 
) + return solve_s + + def _log_solve_profile( + self, + solve_kind, + objective_value, + objective_s, + solve_s, + extract_s, + total_s, + ): + mesh = self.profile["mesh"] + model = self.profile["model"] + timings = self.profile["timings"] + status = pulp.LpStatus.get(self.prob.status, self.prob.status) + pipeline_total_s = timings["init_total_s"] + total_s + logger.info( + "ShardingOptimizer %s profile: " + "mesh_shape=%s mesh_dim_names=%s mesh_size=%s model_params=%s " + "unique_ilp_vars=%s constraints=%s status=%s objective=%.4f " + "timings={strategy_enumeration=%.3fs,cost_estimation=%.3fs," + "ilp_construction=%.3fs,objective=%.3fs,solve=%.3fs," + "extract=%.3fs,total_solve_call=%.3fs,total_pipeline=%.3fs}", + solve_kind, + mesh["shape"], + mesh["dim_names"], + mesh["size"], + self._format_billions(model["parameter_numel"]), + len(self.pulp_variables), + len(self.prob.constraints), + status, + objective_value, + timings["strategy_enumeration_s"], + timings["cost_estimation_s"], + timings["ilp_construction_s"], + objective_s, + solve_s, + extract_s, + total_s, + pipeline_total_s, + ) + self.profile["last_solve"] = { + "kind": solve_kind, + "objective": objective_value, + "status": status, + "constraints": len(self.prob.constraints), + "unique_variables": len(self.pulp_variables), + "objective_s": objective_s, + "solve_s": solve_s, + "extract_s": extract_s, + "total_s": total_s, + "pipeline_total_s": pipeline_total_s, + } + logger.debug("ShardingOptimizer solve profile detail: %s", self.profile) def _extract_and_validate_solution(self): """Validate the ILP solution and return the optimal strategy per node.""" @@ -948,13 +1258,26 @@ def _to_concrete_solution(self, solution): def get_solution(self, verbose=False): t0 = time.perf_counter() + t_objective0 = time.perf_counter() self._set_objective() - self._solve(verbose) - obj_value = pulp.value(self.prob.objective) + t_objective1 = time.perf_counter() + solve_s = self._solve(verbose) + obj_value = 
self._safe_float(pulp.value(self.prob.objective)) + t_extract0 = time.perf_counter() + solution = self._to_orig_solution(self._extract_and_validate_solution()) + t_extract1 = time.perf_counter() logger.debug( "ILP solve took %.3fs (objective=%.4f)", time.perf_counter() - t0, obj_value ) - return self._to_orig_solution(self._extract_and_validate_solution()) + self._log_solve_profile( + "solve", + obj_value, + t_objective1 - t_objective0, + solve_s, + t_extract1 - t_extract0, + t_extract1 - t0, + ) + return solution def resolve(self, verbose=False): """Re-solve the ILP after adding or removing constraints. @@ -963,14 +1286,25 @@ def resolve(self, verbose=False): be called multiple times after modifying constraints. """ t0 = time.perf_counter() - self._solve(verbose) - obj_value = pulp.value(self.prob.objective) + solve_s = self._solve(verbose) + obj_value = self._safe_float(pulp.value(self.prob.objective)) + t_extract0 = time.perf_counter() + solution = self._to_orig_solution(self._extract_and_validate_solution()) + t_extract1 = time.perf_counter() logger.debug( "ILP re-solve took %.3fs (objective=%.4f)", time.perf_counter() - t0, obj_value, ) - return self._to_orig_solution(self._extract_and_validate_solution()) + self._log_solve_profile( + "re-solve", + obj_value, + 0.0, + solve_s, + t_extract1 - t_extract0, + t_extract1 - t0, + ) + return solution def remove_constraints(self, names): """Remove constraints by name, allowing re-solve to revert to the diff --git a/docs/codebase_pipeline.md b/docs/codebase_pipeline.md new file mode 100644 index 00000000..533c4c09 --- /dev/null +++ b/docs/codebase_pipeline.md @@ -0,0 +1,593 @@ +# AutoParallel Codebase Pipeline + +This document is a code-oriented guide for new contributors. It explains the +main pipeline, the important modules, and how data moves from a user model to a +parallelized module. 
+ +AutoParallel is experimental and tightly coupled to PyTorch internals such as +FX, Dynamo export, AOTAutograd, DTensor, and Inductor. The best mental model is: + +```text +user model + -> fake/global tracing + -> joint forward/backward FX graph + -> per-node sharding strategy enumeration + -> ILP optimization + -> graph lowering with redistributions + -> parallel nn.Module with sharded params/buffers + -> optional torch.compile backend passes +``` + +## Public Entry Points + +The public API is exported from `autoparallel/__init__.py`: + +- `auto_parallel(...)`: simple wrapper for common usage. +- `AutoParallel(...)`: context-manager API for debugging and custom constraints. +- `autoparallel_backend(...)`: `torch.compile` backend wrapper for activation + checkpointing and communication/compute overlap passes. +- `with_sharding_constraint(...)`: model-level constraint helper. + +The main implementation lives in `autoparallel/api.py`. + +## End-to-End Pipeline + +### 1. User Defines Model, Mesh, and Example Inputs + +Users provide: + +- an `nn.Module`, often built on the `meta` device, +- a PyTorch `DeviceMesh`, +- example inputs, +- output placement constraints, +- optionally mixed precision and parameter memory constraints. + +The simple API accepts real tensors or DTensors as `sample_inputs`. DTensor +inputs are important because their placements become input constraints. Regular +tensors are treated as replicated on every mesh dimension. + +Relevant files: + +- `autoparallel/api.py` +- `autoparallel/input_validation.py` +- `docs/api_walkthrough.md` +- `examples/example_autoparallel.py` +- `examples/example_hf.py` + +### 2. Input Metadata Is Normalized + +In `auto_parallel(...)`, sample inputs are converted into metadata: + +- global shapes, +- dtypes, +- devices, +- input placement tuples, +- pytree structure. + +This is handled by `_extract_input_info(...)` and `_make_input_fn(...)` in +`autoparallel/input_validation.py`. 
+ +The generated `input_fn()` creates fresh tensors with the same global metadata. +It is called later inside `FakeTensorMode`, so the tensors become fake tensors +instead of real allocations. + +### 3. AutoParallel Context Setup + +`AutoParallel.__init__` prepares the optimization environment: + +- deep-copies the user model so tracing and dtype wrappers do not mutate it, +- canonicalizes and applies mixed precision wrappers if requested, +- moves meta parameters and buffers into fake tensors on the mesh device, +- stores the mesh, cost model, and dynamic-shape setting, +- optionally creates a `ShapeEnv` for symbolic shapes. + +`AutoParallel.__enter__` then: + +- configures the NCCL topology cost model, +- enters the `DeviceMesh` context, +- traces the model into a joint graph, +- disables Inductor comprehensive padding while AutoParallel is active, +- constructs a `ShardingOptimizer`. + +Relevant files: + +- `autoparallel/api.py` +- `autoparallel/tracing.py` +- `autoparallel/cast_parametrization.py` +- `autoparallel/cost_models/nccl_cost_model.py` +- `autoparallel/cost_models/collective_runtime_estimation.py` + +### 4. Model Is Traced Into a Joint FX Graph + +Tracing happens in `build_joint_graph(...)` in `autoparallel/api.py`. + +The flow is: + +1. Call `input_fn()` under `FakeTensorMode`. +2. Optionally convert fake inputs to symbolic dynamic inputs. +3. Capture a forward graph with Dynamo export. +4. Restore model state after capture. +5. Add unused params and buffers so they still appear in the parameter specs. +6. Use AOTAutograd to export a joint forward/backward graph. +7. Clean up and normalize the graph. +8. Optionally replace `view -> mm -> view` patterns with `einsum`. +9. Add alias nodes to expose more optimization opportunities. + +The resulting graph is a single FX graph containing forward computation, +backward computation, parameter nodes, gradients, tangents, and outputs. 
+AutoParallel optimizes this joint graph rather than optimizing only the forward +path. + +Relevant files: + +- `autoparallel/api.py` +- `autoparallel/tracing.py` +- `autoparallel/graph_passes/graph_utils.py` +- `autoparallel/graph_passes/extract_forward.py` + +## Sharding Strategy Generation + +### 5. The Optimizer Builds Placement Options + +`ShardingOptimizer` is implemented in `autoparallel/optimize_sharding.py`. + +It first creates a concrete copy of the graph with symbolic dimensions replaced +by their hinted concrete values. The optimizer uses this concrete graph for +strategy enumeration, cost estimation, graph clustering, and ILP construction. +The original graph is kept for `apply_sharding`, which may still need symbolic +shape metadata. + +For each tensor-producing node, `build_sharding_metadata()` creates an +`OpStrategy`. An `OpStrategy` is a list of possible `OpSpec` choices. Each +`OpSpec` describes: + +- expected input DTensor specs, +- produced output DTensor specs, +- redistribution costs from predecessor placements. + +Placeholders and parameters start with all valid placements generated by +`_create_all_options(...)`. Call-function nodes get strategies from +`get_placement_options_for_node(...)`. + +Relevant files: + +- `autoparallel/optimize_sharding.py` +- `autoparallel/shardings/placement_options.py` +- `autoparallel/shardings/propagation_rules.py` + +### 6. Placement Rules Come From DTensor Plus AutoParallel Overrides + +`autoparallel/shardings/placement_options.py` dispatches strategy generation. + +For normal ops: + +- if AutoParallel has a custom rule in `_op_rules`, it uses that, +- otherwise it asks PyTorch DTensor for an op strategy through helper wrappers. + +AutoParallel adds custom rules in `autoparallel/shardings/propagation_rules.py`. +These rules cover cases where the default DTensor propagation is missing, +too strict, or not shaped for AutoParallel's optimizer. 
+ +Important examples: + +- view and reshape-like ops, +- `operator.getitem`, +- pointwise behavior, +- tensor factory ops, +- matmul/einsum behavior, +- local-map and MoE-related higher-order ops, +- flex attention higher-order ops. + +After strategies are generated, AutoParallel: + +- propagates tensor metadata, +- fills missing redistribution costs, +- removes invalid shardings where tensor dimensions are too small for the mesh, +- deduplicates equivalent configurations, +- caches repeated placement-option lookups. + +## Cost Model + +### 7. Compute Cost + +Compute cost is estimated in `autoparallel/cost_models/compute_estimation.py`. + +The broad idea is: + +- count FLOPs when possible, +- estimate memory read/write time, +- estimate compute time from device throughput, +- use the max of memory time and compute time, +- apply a small launch floor for tiny kernels, +- treat pure view-like shape operations as cheap or free. + +The module contains hardware limit tables for several GPU families and a flop +counter extension for `einsum`. + +### 8. Communication Cost + +Communication cost is estimated in +`autoparallel/cost_models/collective_runtime_estimation.py`. + +The key transition types are: + +- `Shard -> Replicate`: all-gather, +- `Partial -> Replicate`: all-reduce, +- `Partial -> Shard`: reduce-scatter, +- `Shard(dim_a) -> Shard(dim_b)`: all-to-all, +- `Replicate -> Shard`: local narrowing, usually no collective. + +By default, `AutoParallel.__enter__` detects an NCCL topology config and the +cost model dispatches to `autoparallel/cost_models/nccl_cost_model.py`. This is +important because intra-node and inter-node collectives have very different +costs. + +Redistribution cost also includes penalties for non-contiguous layouts and +non-dim-0 shard reshuffling, because those cases need extra memory movement. + +### 9. Transition Cost + +The optimizer also adds a small sharding-transition penalty when a producer and +consumer use different placements. 
This is a tie-breaker that encourages +placement stability when communication and compute costs are otherwise similar. + +## ILP Optimization + +### 10. Decision Variables + +The ILP is built in `ShardingOptimizer`. + +A decision variable represents: + +```text +(node, argument index, output strategy index, producer input strategy index) +``` + +Each variable has: + +- total cost, +- compute cost, +- communication cost, +- transition cost, +- selected `OpSpec`, +- input and output DTensor specs. + +For repeated subgraphs, graph clustering can link equivalent decision variables +so the ILP is smaller. + +Relevant files: + +- `autoparallel/optimize_sharding.py` +- `autoparallel/graph_passes/graph_clustering.py` + +### 11. Default Constraints + +The optimizer adds these constraints before solving: + +- uniqueness: each node argument selects exactly one choice, +- same-output consistency: all tensor arguments of a multi-input op agree on + one output strategy, +- flow consistency: producer output placement matches consumer input placement, +- invalid-cost constraints: impossible configurations cannot be selected, +- forward/backward consistency constraints, +- gradient-reduce dtype constraints. + +User-facing constraints are layered on top: + +- `add_input_constraints(...)`, +- `add_output_constraints(...)`, +- `add_parameter_memory_constraint(...)`, +- node constraints through optimizer helpers, +- model-embedded `with_sharding_constraint(...)`. + +### 12. Solving + +`get_solution(...)` sets the objective and solves the ILP with PuLP's CBC +solver. 
The objective minimizes total estimated runtime cost across the joint +graph: + +```text +compute cost + communication cost + transition cost +``` + +The result is a mapping: + +```text +FX node -> chosen OpSpec +``` + +Public debugging helpers include: + +- `get_log(...)`, +- `print_costs_for_node(...)`, +- `explain_placement(...)`, +- `diff_solutions(...)`, +- `save(...)` and `load(...)`, +- `save_placements(...)` and `load_placements(...)`, +- `get_json(...)`. + +## Applying the Solution + +### 13. Lowering the Graph to Local Execution + +`apply_placement(...)` calls `apply_sharding_to_model(...)` in +`autoparallel/apply_sharding.py`. + +The important class is `ApplyShardingInterpreter`, an FX interpreter that walks +the original joint graph and inserts the behavior implied by the chosen +placements. + +For each operation, it: + +- looks up the producer specs and target input specs, +- redistributes local tensors when placements differ, +- handles `operator.getitem` specially for tuple outputs, +- localizes shape arguments for tensor factories and view ops, +- wraps view inputs in DTensor in static mode when DTensor should perform + global-to-local shape conversion, +- executes the original op, +- converts DTensor outputs back to local tensors. + +The output is a parallel FX graph that operates on local tensors and explicit +collective/redistribution behavior. + +Relevant files: + +- `autoparallel/apply_sharding.py` +- `autoparallel/shardings/ordered_sharding.py` + +### 14. Parameters and Buffers Are Sharded + +`_shard_params_and_buffers(...)` builds DTensor parameters and buffers from the +solved placements. It uses the original graph's named parameter and buffer +descriptors to map FQNs to FX nodes. + +The returned dictionaries are: + +```text +fqn -> sharded Parameter +fqn -> sharded buffer DTensor +``` + +`make_parallel_module(...)` then constructs the final module. 
+ +Relevant files: + +- `autoparallel/apply_sharding.py` +- `autoparallel/module_construction.py` + +### 15. Parallel Module Construction + +`autoparallel/module_construction.py` creates a new module class that mirrors +the user's original model class. + +It preserves: + +- user-defined instance attributes, +- nested module structure, +- `ModuleDict`-like containers when possible, +- parameter aliases, +- buffer aliases, +- module aliases, +- orphan submodules needed by initialization code. + +It also replaces the module's `forward` with the AutoParallel-generated +function and wraps `init_weights` if the model has one. + +### 16. Runtime Forward + +The generated `forward` in `AutoParallel.apply_placement(...)`: + +1. Flattens user inputs. +2. Validates local runtime shapes and dtypes against traced expectations. +3. Reads DTensor parameters and buffers from the module. +4. Converts parameters and buffers to local tensors. +5. Boxes params, buffers, and runtime inputs into the AOTAutograd-compiled + function. +6. When gradients are enabled, uses the joint forward/backward function. +7. Otherwise, under `torch.no_grad()`, uses a forward-only extracted graph. + +The returned parallel module expects local per-rank tensors at runtime, not +global tensors. + +### 17. Initialization and Loading + +A common workflow is: + +```python +with torch.device("meta"): + model = MyModel(...) + +parallel_model = auto_parallel(...) +parallel_model.to_empty(device="cuda") +parallel_model.init_weights() +``` + +`autoparallel/init_weights.py` makes typical single-GPU initialization code +work with sharded DTensor parameters. It intercepts parameter and buffer +assignments during `init_weights` and copies the assigned full tensor into the +existing DTensor placement. 
+ +Save/load support lives in: + +- `autoparallel/serialization.py` +- `docs/save_load.md` + +## Optional Compilation Pipeline + +The eager parallel module can be passed to: + +```python +torch.compile(parallel_model, backend=autoparallel_backend()) +``` + +`autoparallel/compile.py` wraps Inductor and can enable: + +- activation checkpointing joint pass, +- collective bucketing, +- overlap scheduling, +- insertion of overlap dependencies, +- prefetch limits. + +Activation checkpointing logic is in: + +- `autoparallel/graph_passes/activation_checkpointing.py` + +Other graph and scheduling passes live under: + +- `autoparallel/graph_passes/` +- `autoparallel/graph_passes/async_tp/` +- `autoparallel/graph_passes/autobucketing_inductor/` + +## Important Supporting Areas + +### Custom Ops and Constraints + +`autoparallel/collectives.py` exposes sharding constraints and related +collective helpers. Model authors can use `with_sharding_constraint(...)` inside +model code to force an intermediate placement. + +`autoparallel/ops.py` contains registered AutoParallel-specific operations. + +### Local Map and MoE + +AutoParallel has special handling for `local_map` and MoE-style communication. +Placement options for local-map higher-order ops are generated in +`placement_options.py`, while user-facing examples and explanations are in: + +- `docs/hc_and_moe.md` +- `examples/example_local_map.py` +- `examples/example_dcp.py` +- `examples/native_ds3/` + +### Dynamic Shapes + +When `dynamic=True`, `AutoParallel` traces with symbolic dimensions. The +optimizer still works on a concretized graph, but `apply_sharding` preserves the +original symbolic graph and recreates local fake inputs with fresh symbols for +lowering. Runtime input validation allows dimensions marked dynamic to vary. 
+ +Relevant files: + +- `autoparallel/api.py` +- `autoparallel/optimize_sharding.py` +- `autoparallel/apply_sharding.py` +- `autoparallel/input_validation.py` +- `tests/test_dynamic_shapes.py` + +### JSON and Visualization + +The optimizer can export strategy decisions to JSON with `get_json()`. + +Relevant files: + +- `autoparallel/export_json.py` +- `autoparallel/visualizer/build_display_from_json.py` +- `tests/test_export_json.py` + +## Directory Map + +```text +autoparallel/ + api.py public APIs and orchestration + tracing.py fake tensor conversion and decomposition setup + input_validation.py sample input metadata and runtime checks + optimize_sharding.py ILP optimizer and debugging helpers + apply_sharding.py graph lowering and sharded param creation + module_construction.py final parallel module construction + init_weights.py DTensor-aware init_weights wrapper + compile.py torch.compile backend wrapper + collectives.py sharding constraints and collective helpers + ops.py custom operator registrations + serialization.py optimizer and placement save/load + export_json.py visualization/export format + +autoparallel/shardings/ + placement_options.py per-node strategy generation + propagation_rules.py custom DTensor propagation rules + dtensor_sharding_helpers.py wrappers around DTensor strategy APIs + ordered_sharding.py optimized redistribution ordering + +autoparallel/cost_models/ + compute_estimation.py operation runtime estimates + collective_runtime_estimation.py redistribution cost estimates + nccl_cost_model.py NCCL topology-aware cost model + +autoparallel/graph_passes/ + graph_utils.py graph cleanup and helper analysis + graph_clustering.py repeated-subgraph detection + activation_checkpointing.py recomputation/AC tagging and pass + extract_forward.py forward-only graph extraction + auto_bucketing.py bucketing helpers + async_tp/ async tensor-parallel passes + autobucketing_inductor/ Inductor-oriented bucketing passes + +docs/ user and contributor 
documentation +examples/ runnable examples +tests/ behavior and regression tests +``` + +## How to Read the Code + +For a first pass, read in this order: + +1. `docs/basic_concepts.md` +2. `docs/api_walkthrough.md` +3. `autoparallel/api.py` +4. `autoparallel/optimize_sharding.py` +5. `autoparallel/shardings/placement_options.py` +6. `autoparallel/shardings/propagation_rules.py` +7. `autoparallel/apply_sharding.py` +8. `autoparallel/module_construction.py` +9. `autoparallel/compile.py` + +Then use tests to understand edge cases: + +- `tests/test_api.py` +- `tests/test_auto_parallel_simple.py` +- `tests/test_optimize_placement.py` +- `tests/test_propagation_rules.py` +- `tests/test_apply_sharding.py` +- `tests/test_dynamic_shapes.py` +- `tests/test_flex_attention.py` +- `tests/test_inference_path.py` + +## Debugging Workflow + +When investigating a model or optimizer decision: + +1. Start with the full `AutoParallel` API instead of `auto_parallel(...)`. +2. Add explicit input and output constraints. +3. Add a parameter memory constraint if you expect FSDP-like sharding. +4. Call `optimize_placement(verbose=True)`. +5. Read the optimizer log for chosen placements and cost breakdowns. +6. Use `print_costs_for_node(...)` for a suspicious node. +7. Use `explain_placement(...)` to compare a target placement with the chosen + placement. +8. Temporarily add a node constraint and compare with `diff_solutions(...)`. +9. Inspect the parallel graph emitted by structured logs or `parallel_gm`. + +Common symptoms: + +- Replicated parameters: missing or loose parameter memory constraint. +- Infeasible ILP: contradictory input/output/node constraints or shard dim too + small for the mesh. +- Unexpected all-gather/all-reduce: producer and consumer placements disagree. +- Shape mismatch at runtime: passing global tensors to a module that expects + local tensors. 
+- Dynamic-shape compile failure: check whether symbolic dims were concretized + too early or local shape args were not localized. + +## Contributor Notes + +- Prefer existing DTensor strategy APIs before adding custom propagation rules. +- Add custom rules only when the default rule is missing or does not preserve + the metadata AutoParallel needs. +- Keep optimizer constraints explicit; hidden state makes debugging ILP failures + difficult. +- Add focused tests when touching strategy enumeration, cost modeling, + constraints, or graph lowering. +- Be careful with aliases: parameters, buffers, and modules can share identity, + and the code intentionally preserves those relationships. +- The traced graph uses global shapes; the returned module executes on local + tensors. Many bugs come from mixing those two worlds. diff --git a/examples/example_llama3.py b/examples/example_llama3.py index 3903ba8d..5c09bd43 100644 --- a/examples/example_llama3.py +++ b/examples/example_llama3.py @@ -8,10 +8,6 @@ from functools import partial import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Partial, Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs from autoparallel.api import AutoParallel from autoparallel.compile import autoparallel_backend @@ -24,6 +20,9 @@ ) from autoparallel.graph_passes.debug_helpers import make_custom_runtime_estimation from autoparallel.graph_passes.estimate_graph_metrics import estimate_graph_metrics +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Partial, Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore logging.basicConfig(level=logging.DEBUG) diff --git a/profile_results/llama3_3b_ilp_node_indegree_distribution.svg 
b/profile_results/llama3_3b_ilp_node_indegree_distribution.svg new file mode 100644 index 00000000..d722fd85 --- /dev/null +++ b/profile_results/llama3_3b_ilp_node_indegree_distribution.svg @@ -0,0 +1,51 @@ + + + +AutoParallel ILP Node In-Degree Distribution +LLaMA3 3B, mesh=(64,), repeated_subgraphs=True; raw optimizer DAG, no manual cluster collapse +Nodes excluding output: 7199; unique direct dependency edges: 8805 + +1 + +10 + +100 + +1000 + +10000 + + +direct dependency nodes / in-degree +node count, log scale + +257 +3.57% +0 + +5275 +73.27% +1 + +1611 +22.38% +2 + +28 +0.39% +3 + +28 +0.39% +8 +Histogram: 0->257, 1->5275, 2->1611, 3->28, 8->28 + \ No newline at end of file diff --git a/profile_results/llama3_8b_4x4_strategy_full.json b/profile_results/llama3_8b_4x4_strategy_full.json new file mode 100644 index 00000000..88f58ae3 --- /dev/null +++ b/profile_results/llama3_8b_4x4_strategy_full.json @@ -0,0 +1,287470 @@ +{ + "mesh": { + "dim_names": [ + "dp", + "tp" + ], + "shape": [ + 4, + 4 + ] + }, + "nodes": [ + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "tok_embeddings.weight", + "name": "primals_1", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(1)S(1)", + "shape": [ + 128256, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.0.attention.wq.weight", + "name": "primals_2", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.0.attention.wk.weight", + "name": "primals_3", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": 
[], + "module_path": "layers.0.attention.wv.weight", + "name": "primals_4", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.0.attention.wo.weight", + "name": "primals_5", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.0.feed_forward.w1.weight", + "name": "primals_6", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.0.feed_forward.w2.weight", + "name": "primals_7", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.0.feed_forward.w3.weight", + "name": "primals_8", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.0.attention_norm.weight", + "name": "primals_9", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.0.ffn_norm.weight", + "name": "primals_10", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", 
+ "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.1.attention.wq.weight", + "name": "primals_11", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.1.attention.wk.weight", + "name": "primals_12", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.1.attention.wv.weight", + "name": "primals_13", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.1.attention.wo.weight", + "name": "primals_14", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.1.feed_forward.w1.weight", + "name": "primals_15", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.1.feed_forward.w2.weight", + "name": "primals_16", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": 
"layers.1.feed_forward.w3.weight", + "name": "primals_17", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.1.attention_norm.weight", + "name": "primals_18", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.1.ffn_norm.weight", + "name": "primals_19", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.2.attention.wq.weight", + "name": "primals_20", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.2.attention.wk.weight", + "name": "primals_21", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.2.attention.wv.weight", + "name": "primals_22", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.2.attention.wo.weight", + "name": "primals_23", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + 
], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.2.feed_forward.w1.weight", + "name": "primals_24", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.2.feed_forward.w2.weight", + "name": "primals_25", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.2.feed_forward.w3.weight", + "name": "primals_26", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.2.attention_norm.weight", + "name": "primals_27", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.2.ffn_norm.weight", + "name": "primals_28", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.3.attention.wq.weight", + "name": "primals_29", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.3.attention.wk.weight", + "name": "primals_30", + "op": 
"placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.3.attention.wv.weight", + "name": "primals_31", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.3.attention.wo.weight", + "name": "primals_32", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.3.feed_forward.w1.weight", + "name": "primals_33", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.3.feed_forward.w2.weight", + "name": "primals_34", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.3.feed_forward.w3.weight", + "name": "primals_35", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.3.attention_norm.weight", + "name": "primals_36", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + 
"compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.3.ffn_norm.weight", + "name": "primals_37", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.4.attention.wq.weight", + "name": "primals_38", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.4.attention.wk.weight", + "name": "primals_39", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.4.attention.wv.weight", + "name": "primals_40", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.4.attention.wo.weight", + "name": "primals_41", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.4.feed_forward.w1.weight", + "name": "primals_42", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.4.feed_forward.w2.weight", + "name": "primals_43", + "op": "placeholder", + "phase": "forward", + 
"placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.4.feed_forward.w3.weight", + "name": "primals_44", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.4.attention_norm.weight", + "name": "primals_45", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.4.ffn_norm.weight", + "name": "primals_46", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.5.attention.wq.weight", + "name": "primals_47", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.5.attention.wk.weight", + "name": "primals_48", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.5.attention.wv.weight", + "name": "primals_49", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + 
"module_path": "layers.5.attention.wo.weight", + "name": "primals_50", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.5.feed_forward.w1.weight", + "name": "primals_51", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.5.feed_forward.w2.weight", + "name": "primals_52", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.5.feed_forward.w3.weight", + "name": "primals_53", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.5.attention_norm.weight", + "name": "primals_54", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.5.ffn_norm.weight", + "name": "primals_55", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.6.attention.wq.weight", + "name": "primals_56", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + 
"shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.6.attention.wk.weight", + "name": "primals_57", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.6.attention.wv.weight", + "name": "primals_58", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.6.attention.wo.weight", + "name": "primals_59", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.6.feed_forward.w1.weight", + "name": "primals_60", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.6.feed_forward.w2.weight", + "name": "primals_61", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.6.feed_forward.w3.weight", + "name": "primals_62", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": 
"layers.6.attention_norm.weight", + "name": "primals_63", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.6.ffn_norm.weight", + "name": "primals_64", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.7.attention.wq.weight", + "name": "primals_65", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.7.attention.wk.weight", + "name": "primals_66", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.7.attention.wv.weight", + "name": "primals_67", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.7.attention.wo.weight", + "name": "primals_68", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.7.feed_forward.w1.weight", + "name": "primals_69", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + 
], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.7.feed_forward.w2.weight", + "name": "primals_70", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.7.feed_forward.w3.weight", + "name": "primals_71", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.7.attention_norm.weight", + "name": "primals_72", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.7.ffn_norm.weight", + "name": "primals_73", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.8.attention.wq.weight", + "name": "primals_74", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.8.attention.wk.weight", + "name": "primals_75", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.8.attention.wv.weight", + "name": "primals_76", + "op": 
"placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.8.attention.wo.weight", + "name": "primals_77", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.8.feed_forward.w1.weight", + "name": "primals_78", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.8.feed_forward.w2.weight", + "name": "primals_79", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.8.feed_forward.w3.weight", + "name": "primals_80", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.8.attention_norm.weight", + "name": "primals_81", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.8.ffn_norm.weight", + "name": "primals_82", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 
0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.9.attention.wq.weight", + "name": "primals_83", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.9.attention.wk.weight", + "name": "primals_84", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.9.attention.wv.weight", + "name": "primals_85", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.9.attention.wo.weight", + "name": "primals_86", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.9.feed_forward.w1.weight", + "name": "primals_87", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.9.feed_forward.w2.weight", + "name": "primals_88", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.9.feed_forward.w3.weight", + "name": "primals_89", + "op": "placeholder", + "phase": "forward", + 
"placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.9.attention_norm.weight", + "name": "primals_90", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.9.ffn_norm.weight", + "name": "primals_91", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.10.attention.wq.weight", + "name": "primals_92", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.10.attention.wk.weight", + "name": "primals_93", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.10.attention.wv.weight", + "name": "primals_94", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.10.attention.wo.weight", + "name": "primals_95", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + 
"module_path": "layers.10.feed_forward.w1.weight", + "name": "primals_96", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.10.feed_forward.w2.weight", + "name": "primals_97", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.10.feed_forward.w3.weight", + "name": "primals_98", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.10.attention_norm.weight", + "name": "primals_99", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.10.ffn_norm.weight", + "name": "primals_100", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.11.attention.wq.weight", + "name": "primals_101", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.11.attention.wk.weight", + "name": "primals_102", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": 
"S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.11.attention.wv.weight", + "name": "primals_103", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.11.attention.wo.weight", + "name": "primals_104", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.11.feed_forward.w1.weight", + "name": "primals_105", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.11.feed_forward.w2.weight", + "name": "primals_106", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.11.feed_forward.w3.weight", + "name": "primals_107", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.11.attention_norm.weight", + "name": "primals_108", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": 
"layers.11.ffn_norm.weight", + "name": "primals_109", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.12.attention.wq.weight", + "name": "primals_110", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.12.attention.wk.weight", + "name": "primals_111", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.12.attention.wv.weight", + "name": "primals_112", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.12.attention.wo.weight", + "name": "primals_113", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.12.feed_forward.w1.weight", + "name": "primals_114", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.12.feed_forward.w2.weight", + "name": "primals_115", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + 
"shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.12.feed_forward.w3.weight", + "name": "primals_116", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.12.attention_norm.weight", + "name": "primals_117", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.12.ffn_norm.weight", + "name": "primals_118", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.13.attention.wq.weight", + "name": "primals_119", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.13.attention.wk.weight", + "name": "primals_120", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.13.attention.wv.weight", + "name": "primals_121", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.13.attention.wo.weight", 
+ "name": "primals_122", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.13.feed_forward.w1.weight", + "name": "primals_123", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.13.feed_forward.w2.weight", + "name": "primals_124", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.13.feed_forward.w3.weight", + "name": "primals_125", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.13.attention_norm.weight", + "name": "primals_126", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.13.ffn_norm.weight", + "name": "primals_127", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.14.attention.wq.weight", + "name": "primals_128", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + 
"transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.14.attention.wk.weight", + "name": "primals_129", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.14.attention.wv.weight", + "name": "primals_130", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.14.attention.wo.weight", + "name": "primals_131", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.14.feed_forward.w1.weight", + "name": "primals_132", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.14.feed_forward.w2.weight", + "name": "primals_133", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.14.feed_forward.w3.weight", + "name": "primals_134", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.14.attention_norm.weight", + 
"name": "primals_135", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.14.ffn_norm.weight", + "name": "primals_136", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.15.attention.wq.weight", + "name": "primals_137", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.15.attention.wk.weight", + "name": "primals_138", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.15.attention.wv.weight", + "name": "primals_139", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.15.attention.wo.weight", + "name": "primals_140", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.15.feed_forward.w1.weight", + "name": "primals_141", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 
0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.15.feed_forward.w2.weight", + "name": "primals_142", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.15.feed_forward.w3.weight", + "name": "primals_143", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.15.attention_norm.weight", + "name": "primals_144", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.15.ffn_norm.weight", + "name": "primals_145", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.16.attention.wq.weight", + "name": "primals_146", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.16.attention.wk.weight", + "name": "primals_147", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.16.attention.wv.weight", + "name": "primals_148", + "op": "placeholder", + 
"phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.16.attention.wo.weight", + "name": "primals_149", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.16.feed_forward.w1.weight", + "name": "primals_150", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.16.feed_forward.w2.weight", + "name": "primals_151", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.16.feed_forward.w3.weight", + "name": "primals_152", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.16.attention_norm.weight", + "name": "primals_153", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.16.ffn_norm.weight", + "name": "primals_154", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + 
"dtype": "float32", + "inputs": [], + "module_path": "layers.17.attention.wq.weight", + "name": "primals_155", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.17.attention.wk.weight", + "name": "primals_156", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.17.attention.wv.weight", + "name": "primals_157", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.17.attention.wo.weight", + "name": "primals_158", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.17.feed_forward.w1.weight", + "name": "primals_159", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.17.feed_forward.w2.weight", + "name": "primals_160", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.17.feed_forward.w3.weight", + "name": "primals_161", + "op": "placeholder", + "phase": 
"forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.17.attention_norm.weight", + "name": "primals_162", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.17.ffn_norm.weight", + "name": "primals_163", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.18.attention.wq.weight", + "name": "primals_164", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.18.attention.wk.weight", + "name": "primals_165", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.18.attention.wv.weight", + "name": "primals_166", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.18.attention.wo.weight", + "name": "primals_167", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", 
+ "inputs": [], + "module_path": "layers.18.feed_forward.w1.weight", + "name": "primals_168", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.18.feed_forward.w2.weight", + "name": "primals_169", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.18.feed_forward.w3.weight", + "name": "primals_170", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.18.attention_norm.weight", + "name": "primals_171", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.18.ffn_norm.weight", + "name": "primals_172", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.19.attention.wq.weight", + "name": "primals_173", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.19.attention.wk.weight", + "name": "primals_174", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + 
"placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.19.attention.wv.weight", + "name": "primals_175", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.19.attention.wo.weight", + "name": "primals_176", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.19.feed_forward.w1.weight", + "name": "primals_177", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.19.feed_forward.w2.weight", + "name": "primals_178", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.19.feed_forward.w3.weight", + "name": "primals_179", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.19.attention_norm.weight", + "name": "primals_180", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + 
"module_path": "layers.19.ffn_norm.weight", + "name": "primals_181", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.20.attention.wq.weight", + "name": "primals_182", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.20.attention.wk.weight", + "name": "primals_183", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.20.attention.wv.weight", + "name": "primals_184", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.20.attention.wo.weight", + "name": "primals_185", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.20.feed_forward.w1.weight", + "name": "primals_186", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.20.feed_forward.w2.weight", + "name": "primals_187", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": 
"S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.20.feed_forward.w3.weight", + "name": "primals_188", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.20.attention_norm.weight", + "name": "primals_189", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.20.ffn_norm.weight", + "name": "primals_190", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.21.attention.wq.weight", + "name": "primals_191", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.21.attention.wk.weight", + "name": "primals_192", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.21.attention.wv.weight", + "name": "primals_193", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": 
"layers.21.attention.wo.weight", + "name": "primals_194", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.21.feed_forward.w1.weight", + "name": "primals_195", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.21.feed_forward.w2.weight", + "name": "primals_196", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.21.feed_forward.w3.weight", + "name": "primals_197", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.21.attention_norm.weight", + "name": "primals_198", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.21.ffn_norm.weight", + "name": "primals_199", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.22.attention.wq.weight", + "name": "primals_200", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + 
"shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.22.attention.wk.weight", + "name": "primals_201", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.22.attention.wv.weight", + "name": "primals_202", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.22.attention.wo.weight", + "name": "primals_203", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.22.feed_forward.w1.weight", + "name": "primals_204", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.22.feed_forward.w2.weight", + "name": "primals_205", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.22.feed_forward.w3.weight", + "name": "primals_206", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": 
"layers.22.attention_norm.weight", + "name": "primals_207", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.22.ffn_norm.weight", + "name": "primals_208", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.23.attention.wq.weight", + "name": "primals_209", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.23.attention.wk.weight", + "name": "primals_210", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.23.attention.wv.weight", + "name": "primals_211", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.23.attention.wo.weight", + "name": "primals_212", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.23.feed_forward.w1.weight", + "name": "primals_213", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 
14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.23.feed_forward.w2.weight", + "name": "primals_214", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.23.feed_forward.w3.weight", + "name": "primals_215", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.23.attention_norm.weight", + "name": "primals_216", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.23.ffn_norm.weight", + "name": "primals_217", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.24.attention.wq.weight", + "name": "primals_218", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.24.attention.wk.weight", + "name": "primals_219", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.24.attention.wv.weight", + "name": 
"primals_220", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.24.attention.wo.weight", + "name": "primals_221", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.24.feed_forward.w1.weight", + "name": "primals_222", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.24.feed_forward.w2.weight", + "name": "primals_223", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.24.feed_forward.w3.weight", + "name": "primals_224", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.24.attention_norm.weight", + "name": "primals_225", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.24.ffn_norm.weight", + "name": "primals_226", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + 
"transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.25.attention.wq.weight", + "name": "primals_227", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.25.attention.wk.weight", + "name": "primals_228", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.25.attention.wv.weight", + "name": "primals_229", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.25.attention.wo.weight", + "name": "primals_230", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.25.feed_forward.w1.weight", + "name": "primals_231", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.25.feed_forward.w2.weight", + "name": "primals_232", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.25.feed_forward.w3.weight", + "name": 
"primals_233", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.25.attention_norm.weight", + "name": "primals_234", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.25.ffn_norm.weight", + "name": "primals_235", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.26.attention.wq.weight", + "name": "primals_236", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.26.attention.wk.weight", + "name": "primals_237", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.26.attention.wv.weight", + "name": "primals_238", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.26.attention.wo.weight", + "name": "primals_239", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, 
+ { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.26.feed_forward.w1.weight", + "name": "primals_240", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.26.feed_forward.w2.weight", + "name": "primals_241", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.26.feed_forward.w3.weight", + "name": "primals_242", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.26.attention_norm.weight", + "name": "primals_243", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.26.ffn_norm.weight", + "name": "primals_244", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.27.attention.wq.weight", + "name": "primals_245", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.27.attention.wk.weight", + "name": "primals_246", + "op": "placeholder", + 
"phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.27.attention.wv.weight", + "name": "primals_247", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.27.attention.wo.weight", + "name": "primals_248", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.27.feed_forward.w1.weight", + "name": "primals_249", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.27.feed_forward.w2.weight", + "name": "primals_250", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.27.feed_forward.w3.weight", + "name": "primals_251", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.27.attention_norm.weight", + "name": "primals_252", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + 
"compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.27.ffn_norm.weight", + "name": "primals_253", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.28.attention.wq.weight", + "name": "primals_254", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.28.attention.wk.weight", + "name": "primals_255", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.28.attention.wv.weight", + "name": "primals_256", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.28.attention.wo.weight", + "name": "primals_257", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.28.feed_forward.w1.weight", + "name": "primals_258", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.28.feed_forward.w2.weight", + "name": "primals_259", + "op": "placeholder", + "phase": 
"forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.28.feed_forward.w3.weight", + "name": "primals_260", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.28.attention_norm.weight", + "name": "primals_261", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.28.ffn_norm.weight", + "name": "primals_262", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.29.attention.wq.weight", + "name": "primals_263", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.29.attention.wk.weight", + "name": "primals_264", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.29.attention.wv.weight", + "name": "primals_265", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": 
"float32", + "inputs": [], + "module_path": "layers.29.attention.wo.weight", + "name": "primals_266", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.29.feed_forward.w1.weight", + "name": "primals_267", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.29.feed_forward.w2.weight", + "name": "primals_268", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.29.feed_forward.w3.weight", + "name": "primals_269", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.29.attention_norm.weight", + "name": "primals_270", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.29.ffn_norm.weight", + "name": "primals_271", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.30.attention.wq.weight", + "name": "primals_272", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": 
"param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.30.attention.wk.weight", + "name": "primals_273", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.30.attention.wv.weight", + "name": "primals_274", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.30.attention.wo.weight", + "name": "primals_275", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.30.feed_forward.w1.weight", + "name": "primals_276", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.30.feed_forward.w2.weight", + "name": "primals_277", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.30.feed_forward.w3.weight", + "name": "primals_278", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + 
"inputs": [], + "module_path": "layers.30.attention_norm.weight", + "name": "primals_279", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.30.ffn_norm.weight", + "name": "primals_280", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.31.attention.wq.weight", + "name": "primals_281", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.31.attention.wk.weight", + "name": "primals_282", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.31.attention.wv.weight", + "name": "primals_283", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.31.attention.wo.weight", + "name": "primals_284", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.31.feed_forward.w1.weight", + "name": "primals_285", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + 
"placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.31.feed_forward.w2.weight", + "name": "primals_286", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.31.feed_forward.w3.weight", + "name": "primals_287", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.31.attention_norm.weight", + "name": "primals_288", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "layers.31.ffn_norm.weight", + "name": "primals_289", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "norm.weight", + "name": "primals_290", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [], + "module_path": "output.weight", + "name": "primals_291", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "param", + "placement": "S(0)S(0)", + "shape": [ + 128256, + 4096 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [], + "module_path": "freqs_cis", + "name": "primals_292", + 
"op": "placeholder", + "phase": "forward", + "placeholder_kind": "buffer", + "placement": "RR", + "shape": [ + 8192, + 64 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "int64", + "inputs": [], + "name": "primals_293", + "op": "placeholder", + "phase": "forward", + "placeholder_kind": "input", + "placement": "S(0)R", + "shape": [ + 8, + 8192 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [], + "name": "tangents_1", + "op": "placeholder", + "phase": "backward", + "placeholder_kind": "tangent", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 128256 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 76.40578345195063, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(1)S(1)", + "name": "primals_1", + "src_placement": "S(1)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].tok_embeddings", + "name": "dtype_cast", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(1)S(1)", + "shape": [ + 128256, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "int64", + "inputs": [ + { + "comm_cost": 21.38246153846154, + "dst_placement": "RR", + "name": "primals_293", + "src_placement": "S(0)R", + "transition_cost": 1 + } + ], + "module_path": "L['self'].tok_embeddings", + "name": "alias_default_1", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 8, + 8192 + ], + "transition_cost": 1.0 + }, + { + "compute_cost": 38.685829146330285, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(1)S(1)", + "name": "dtype_cast", + "src_placement": "S(1)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": 
"RR", + "name": "alias_default_1", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].tok_embeddings", + "name": "embedding", + "op": "aten.embedding.default", + "phase": "forward", + "placement": "S(2)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 539 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 0, + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_9", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "dtype_cast_1", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 706.2108351658422, + "dst_placement": "S(0)S(1)", + "name": "embedding", + "src_placement": "S(2)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].tok_embeddings", + "name": "alias_default_3", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 539 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 1, + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "alias_default_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "convert_element_type", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "alias_default_5", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "pow_1", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "pow_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "mean", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "add", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "rsqrt", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.0.attention_norm", + "name": "alias_default_6", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "mul", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_1", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "alias_default_4", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "mul", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_4", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "mul_1", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "convert_element_type_1", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_2", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wq", + "name": "dtype_cast_2", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 
13, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_2", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention.wq", + "name": "permute", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_1", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "alias_default_7", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wq", + "name": "alias_default_8", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "compute_cost": 
198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_7", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_8", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wq", + "name": "einsum_default", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_3", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wk", + "name": "dtype_cast_3", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_3", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention.wk", + "name": "permute_1", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wk", + "name": "alias_default_9", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_7", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_9", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention.wk", + "name": "einsum_default_1", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_4", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wv", + "name": "dtype_cast_4", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 
1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_4", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention.wv", + "name": "permute_2", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_2", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wv", + "name": "alias_default_10", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_7", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_10", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wv", + "name": 
"einsum_default_2", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_6", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_7", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_8", + "op": "aten.view.default", 
+ "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "convert_element_type_8", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_9", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.0.attention", + "name": "view_as_complex", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "convert_element_type_9", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_10", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "view_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_as_complex_1", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "primals_292", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "name": "alias_default", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 8192, + 64 + ], + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_11", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_11", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "alias_default_11", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + 
"code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_11", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "mul_2", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_as_real", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": 
"view_12", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_11", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "mul_3", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_as_real_1", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "view_as_real_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_13", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_12", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "convert_element_type_10", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "convert_element_type_11", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "unsqueeze", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "expand", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "clone", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "compute_cost": 0.0, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_14", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "unsqueeze_1", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "expand_1", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "clone_1", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_15", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "permute_3", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "permute_4", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_15", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "permute_5", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "alias_default_12", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "compute_cost": 0.0, + "dtype": "bfloat16", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_4", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "alias_default_13", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "alias_default_14", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_13", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.sdpa", + "name": "_scaled_dot_product_flash_attention", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": 
"forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.sdpa", + "name": "getitem", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.sdpa", + "name": "getitem_1", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.sdpa", + "name": "getitem_6", + 
"op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.sdpa", + "name": "getitem_7", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.sdpa", + "name": "alias_default_15", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "permute_6", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + 
"shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_6", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_16", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_5", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wo", + "name": "dtype_cast_5", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_5", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention.wo", + "name": "permute_7", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": 
{ + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "alias_default_16", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_7", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wo", + "name": "alias_default_17", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_17", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wo", + "name": "einsum_default_3", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ 
+ 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0", + "name": "add_1", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_10", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "dtype_cast_6", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0", + "name": "alias_default_18", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "convert_element_type_14", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "alias_default_20", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": 
"pow_2", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_2", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "mean_1", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "add_2", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_2", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "rsqrt_1", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 
8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "alias_default_21", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "mul_4", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_6", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "alias_default_19", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_4", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_19", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "mul_5", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "convert_element_type_15", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_6", + "src_placement": "S(0)S(0)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w1", + "name": "dtype_cast_7", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_7", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w1", + "name": "permute_8", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_15", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "alias_default_22", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_8", + "src_placement": "RS(1)", + "transition_cost": 
0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w1", + "name": "alias_default_23", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_22", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_23", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w1", + "name": "einsum_default_4", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_4", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w1", + "name": "alias_default_24", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "compute_cost": 136.64587220149252, + "dtype": 
"float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "convert_element_type_18", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "alias_default_25", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "neg", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + 
"compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "exp", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "add_3", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "div", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "convert_element_type_19", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_8", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w3", + "name": "dtype_cast_8", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_8", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w3", + "name": "permute_9", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) 
* self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_9", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w3", + "name": "alias_default_27", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_22", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_27", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w3", + "name": "einsum_default_5", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "alias_default_26", + "op": "aten.alias.default", + "phase": 
"forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w3", + "name": "alias_default_28", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "mul_6", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_7", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": "dtype_cast_9", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_9", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": "permute_10", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "alias_default_29", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_10", + "src_placement": "RS(0)", + "transition_cost": 0 + } 
+ ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": "alias_default_30", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_30", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": "einsum_default_6", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_6", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0", + "name": "add_4", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + 
"func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_18", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "dtype_cast_10", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_4", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0", + "name": "alias_default_31", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "convert_element_type_24", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
+ "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "alias_default_33", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_33", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "pow_3", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "mean_2", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_2", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "add_5", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "rsqrt_2", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_2", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "alias_default_34", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_33", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_34", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "mul_7", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_10", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "alias_default_32", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_7", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "alias_default_32", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "mul_8", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_8", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "convert_element_type_25", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_11", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wq", + "name": "dtype_cast_11", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", 
+ "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_11", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention.wq", + "name": "permute_11", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_25", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "alias_default_35", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_11", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wq", + "name": "alias_default_36", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_35", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_36", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wq", + "name": "einsum_default_7", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_12", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wk", + "name": "dtype_cast_12", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_12", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention.wk", + "name": "permute_12", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + 
"shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_12", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wk", + "name": "alias_default_37", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_35", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_37", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention.wk", + "name": "einsum_default_8", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_13", + "src_placement": 
"S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wv", + "name": "dtype_cast_13", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_13", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention.wv", + "name": "permute_13", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_13", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wv", + "name": "alias_default_38", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_35", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_38", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wv", + "name": "einsum_default_9", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_31", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_32", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + 
"func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_33", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "convert_element_type_32", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_32", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_34", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + 
"code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_as_complex_2", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_32", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "convert_element_type_33", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_33", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_35", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_35", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_as_complex_3", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_36", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_36", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "alias_default_39", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_39", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "mul_9", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_as_real_2", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_37", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_39", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "mul_10", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.1.attention", + "name": "view_as_real_3", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_38", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "convert_element_type_34", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_38", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "convert_element_type_35", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_35", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "unsqueeze_2", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "expand_2", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "clone_2", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_39", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_33", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "unsqueeze_3", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "expand_3", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "clone_3", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_40", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + 
"code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "permute_14", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_39", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "permute_15", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_40", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "permute_16", + "op": "aten.permute.default", + 
"phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "alias_default_40", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "alias_default_41", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_16", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "alias_default_42", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_40", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_41", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_42", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_1", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.sdpa", + "name": "getitem_9", 
+ "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.sdpa", + "name": "getitem_10", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_1", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.sdpa", + "name": "getitem_15", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_1", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.sdpa", + "name": "getitem_16", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + 
"source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.sdpa", + "name": "alias_default_43", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_43", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "permute_17", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_17", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_41", + "op": 
"aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_14", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wo", + "name": "dtype_cast_14", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_14", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention.wo", + "name": "permute_18", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_41", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.1.attention", + "name": "alias_default_44", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_18", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wo", + "name": "alias_default_45", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_44", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_45", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wo", + "name": "einsum_default_10", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_10", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1", + "name": "add_6", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_19", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "dtype_cast_15", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1", + "name": "alias_default_46", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_46", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "convert_element_type_38", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_38", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "alias_default_48", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_48", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "pow_4", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + 
"shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_4", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "mean_3", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "add_7", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_7", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "rsqrt_3", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], 
+ "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "alias_default_49", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_48", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_49", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "mul_11", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_15", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.1.ffn_norm", + "name": "alias_default_47", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_11", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_47", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "mul_12", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "convert_element_type_39", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + 
"compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_15", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w1", + "name": "dtype_cast_16", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_16", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w1", + "name": "permute_19", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_39", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "alias_default_50", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + 
"line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_19", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w1", + "name": "alias_default_51", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_50", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_51", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w1", + "name": "einsum_default_11", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w1", + "name": "alias_default_52", + "op": "aten.alias.default", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "convert_element_type_42", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_42", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "alias_default_53", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_53", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "neg_1", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "exp_1", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "add_8", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"alias_default_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "div_1", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "convert_element_type_43", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_17", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w3", + "name": "dtype_cast_17", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_17", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w3", + "name": "permute_20", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_20", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w3", + "name": "alias_default_55", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_50", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_55", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w3", + "name": 
"einsum_default_12", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "alias_default_54", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_12", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w3", + "name": "alias_default_56", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"alias_default_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "mul_13", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": "dtype_cast_18", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_18", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": "permute_21", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "alias_default_57", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_21", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": "alias_default_58", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_58", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": "einsum_default_13", + "op": "aten.einsum.default", + "phase": 
"forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_46", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_13", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1", + "name": "add_9", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_27", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "dtype_cast_19", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "add_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1", + "name": "alias_default_59", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_59", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "convert_element_type_48", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_48", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "alias_default_61", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": 
"pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_61", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "pow_5", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "mean_4", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_4", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "add_10", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + 
"cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_10", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "rsqrt_4", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_4", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "alias_default_62", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_61", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_62", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "mul_14", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
+ "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_19", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "alias_default_60", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_60", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "mul_15", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": 
"convert_element_type_49", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_20", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wq", + "name": "dtype_cast_20", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_20", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention.wq", + "name": "permute_22", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_49", + 
"src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "alias_default_63", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_22", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wq", + "name": "alias_default_64", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_63", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_64", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wq", + "name": "einsum_default_14", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_21", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wk", + "name": "dtype_cast_21", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_21", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention.wk", + "name": "permute_23", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_23", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wk", + "name": "alias_default_65", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_63", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_65", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention.wk", + "name": "einsum_default_15", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_22", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wv", + "name": "dtype_cast_22", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_22", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.2.attention.wv", + "name": "permute_24", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_24", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wv", + "name": "alias_default_66", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_63", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_66", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wv", + "name": "einsum_default_16", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_56", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_15", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_57", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_58", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "convert_element_type_56", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_59", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_59", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_as_complex_4", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = 
torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "convert_element_type_57", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_60", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_60", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.2.attention", + "name": "view_as_complex_5", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_61", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_61", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "alias_default_67", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"view_as_complex_4", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_67", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "mul_16", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_as_real_4", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_4", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_62", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_67", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "mul_17", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_17", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_as_real_5", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_63", + "op": "aten.view.default", + "phase": 
"forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_62", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "convert_element_type_58", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_63", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "convert_element_type_59", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + 
"name": "convert_element_type_59", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "unsqueeze_4", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_4", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "expand_4", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_4", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "clone_4", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", 
+ "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_4", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_64", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "unsqueeze_5", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "expand_5", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + 
"compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "clone_5", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_65", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "permute_25", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + 
"line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_64", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "permute_26", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_65", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "permute_27", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "alias_default_68", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, 
head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "alias_default_69", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "alias_default_70", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_68", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_69", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_70", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_2", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_2", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.sdpa", + "name": "getitem_18", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_2", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.sdpa", + "name": "getitem_19", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_2", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.sdpa", + "name": "getitem_24", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_2", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.sdpa", + "name": "getitem_25", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.sdpa", + "name": "alias_default_71", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_71", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "permute_28", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_28", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_66", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_23", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wo", + "name": "dtype_cast_23", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_23", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention.wo", + "name": "permute_29", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_66", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "alias_default_72", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_29", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wo", + "name": "alias_default_73", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { 
+ "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_72", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_73", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wo", + "name": "einsum_default_17", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_59", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_17", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2", + "name": "add_11", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(0)", + "name": "primals_28", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "dtype_cast_24", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_11", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2", + "name": "alias_default_74", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_74", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "convert_element_type_62", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + 
"cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_62", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "alias_default_76", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_76", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "pow_6", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "mean_5", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "add_12", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "rsqrt_5", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "alias_default_77", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 
0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_76", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_77", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "mul_18", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_24", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "alias_default_75", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_75", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "mul_19", + "op": 
"aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "convert_element_type_63", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_24", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w1", + "name": "dtype_cast_25", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_25", + "src_placement": "S(0)S(0)", 
+ "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w1", + "name": "permute_30", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_63", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "alias_default_78", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_30", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w1", + "name": "alias_default_79", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)R", + "name": "alias_default_78", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_79", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w1", + "name": "einsum_default_18", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w1", + "name": "alias_default_80", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_80", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "convert_element_type_66", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_66", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "alias_default_81", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_81", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "neg_2", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "exp_2", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + 
], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "add_13", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_81", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "div_2", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_2", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "convert_element_type_67", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_26", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w3", + "name": "dtype_cast_26", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_26", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w3", + "name": "permute_31", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", 
+ "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_31", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w3", + "name": "alias_default_83", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_78", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_83", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w3", + "name": "einsum_default_19", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_67", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "alias_default_82", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w3", + "name": "alias_default_84", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_82", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_84", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "mul_20", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.2.feed_forward.w2", + "name": "dtype_cast_27", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_27", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w2", + "name": "permute_32", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_20", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "alias_default_85", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": 
"permute_32", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w2", + "name": "alias_default_86", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_85", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_86", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w2", + "name": "einsum_default_20", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_74", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_20", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2", + "name": "add_14", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": 
"out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_36", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "dtype_cast_28", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2", + "name": "alias_default_87", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_87", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "convert_element_type_72", + "op": 
"prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_72", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "alias_default_89", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_89", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "pow_7", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_7", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.3.attention_norm", + "name": "mean_6", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "add_15", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "rsqrt_6", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.3.attention_norm", + "name": "alias_default_90", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_89", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_90", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "mul_21", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_28", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "alias_default_88", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 
52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_88", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "mul_22", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_22", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "convert_element_type_73", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_29", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wq", + "name": "dtype_cast_29", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_29", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention.wq", + "name": "permute_33", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_73", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "alias_default_91", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_33", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wq", + "name": "alias_default_92", + "op": "aten.alias.default", + "phase": "forward", + 
"placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_91", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_92", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wq", + "name": "einsum_default_21", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_30", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wk", + "name": "dtype_cast_30", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + 
"dst_placement": "RR", + "name": "dtype_cast_30", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention.wk", + "name": "permute_34", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_34", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wk", + "name": "alias_default_93", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_91", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_93", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention.wk", + "name": "einsum_default_22", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", 
+ "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_31", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wv", + "name": "dtype_cast_31", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_31", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention.wv", + "name": "permute_35", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_35", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wv", + "name": "alias_default_94", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_91", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_94", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wv", + "name": "einsum_default_23", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_21", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_81", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": 
"view_82", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_83", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_81", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "convert_element_type_80", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"convert_element_type_80", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_84", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_84", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_as_complex_6", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_82", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "convert_element_type_81", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_81", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_85", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_85", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_as_complex_7", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_86", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return 
freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_86", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "alias_default_95", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_95", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "mul_23", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.3.attention", + "name": "view_as_real_6", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_87", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_95", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "mul_24", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_as_real_7", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_88", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_87", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "convert_element_type_82", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_88", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "convert_element_type_83", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_83", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "unsqueeze_6", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "expand_6", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + 
"shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "clone_6", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_89", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_83", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "unsqueeze_7", + "op": 
"aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "expand_7", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "clone_7", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.3.attention", + "name": "view_90", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_82", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "permute_36", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_89", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "permute_37", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + 
"name": "view_90", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "permute_38", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_36", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "alias_default_96", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_37", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "alias_default_97", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + 
"cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_38", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "alias_default_98", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_96", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_97", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_98", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_3", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.sdpa", + "name": "getitem_27", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.sdpa", + "name": "getitem_28", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_3", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.sdpa", + "name": "getitem_33", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_3", + 
"src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.sdpa", + "name": "getitem_34", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.sdpa", + "name": "alias_default_99", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_99", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "permute_39", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + 
"dst_placement": "S(0)S(1)", + "name": "permute_39", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_91", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_32", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wo", + "name": "dtype_cast_32", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_32", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention.wo", + "name": "permute_40", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_91", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "alias_default_100", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_40", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wo", + "name": "alias_default_101", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_100", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_101", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wo", + "name": "einsum_default_24", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + 
"line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_87", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3", + "name": "add_16", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_37", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "dtype_cast_33", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3", + "name": "alias_default_102", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, 
+ 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_102", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "convert_element_type_86", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_86", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "alias_default_104", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_104", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.3.ffn_norm", + "name": "pow_8", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_8", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "mean_7", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_7", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "add_17", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_17", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.3.ffn_norm", + "name": "rsqrt_7", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_7", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "alias_default_105", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_104", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_105", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "mul_25", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + 
"inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_33", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "alias_default_103", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_103", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "mul_26", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "convert_element_type_87", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_33", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w1", + "name": "dtype_cast_34", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_34", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w1", + "name": "permute_41", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_87", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "alias_default_106", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, 
+ 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_41", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w1", + "name": "alias_default_107", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_106", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_107", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w1", + "name": "einsum_default_25", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_25", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w1", + "name": "alias_default_108", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_108", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "convert_element_type_90", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_90", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "alias_default_109", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": 
"neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_109", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "neg_3", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "exp_3", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "add_18", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 
0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_109", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "div_3", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "convert_element_type_91", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_35", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w3", + "name": "dtype_cast_35", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + 
"placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_35", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w3", + "name": "permute_42", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_42", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w3", + "name": "alias_default_111", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_106", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + 
"name": "alias_default_111", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w3", + "name": "einsum_default_26", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_91", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "alias_default_110", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w3", + "name": "alias_default_112", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", 
+ "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_110", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_112", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "mul_27", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_34", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w2", + "name": "dtype_cast_36", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_36", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w2", + "name": "permute_43", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_27", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "alias_default_113", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_43", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w2", + "name": "alias_default_114", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_113", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_114", + "src_placement": "RS(0)", + "transition_cost": 0 
+ } + ], + "module_path": "L['self'].layers.3.feed_forward.w2", + "name": "einsum_default_27", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_102", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_27", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3", + "name": "add_19", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_45", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "dtype_cast_37", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3", + "name": "alias_default_115", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_115", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "convert_element_type_96", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_96", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "alias_default_117", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_117", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "pow_9", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "mean_8", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_8", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "add_20", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + 
"file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "rsqrt_8", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_8", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "alias_default_118", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_117", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_118", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "mul_28", + "op": "aten.mul.Tensor", + "phase": 
"forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_37", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "alias_default_116", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_28", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_116", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "mul_29", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "mul_29", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "convert_element_type_97", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_38", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wq", + "name": "dtype_cast_38", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_38", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention.wq", + "name": "permute_44", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_97", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "alias_default_119", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_44", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wq", + "name": "alias_default_120", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_119", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_120", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wq", + "name": "einsum_default_28", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_39", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wk", + "name": "dtype_cast_39", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_39", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention.wk", + "name": "permute_45", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_45", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wk", + "name": "alias_default_121", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + 
"source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_119", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_121", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention.wk", + "name": "einsum_default_29", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_40", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wv", + "name": "dtype_cast_40", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_40", + 
"src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention.wv", + "name": "permute_46", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_46", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wv", + "name": "alias_default_122", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_119", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_122", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wv", + "name": "einsum_default_30", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_106", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_107", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_30", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_108", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_106", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "convert_element_type_104", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_104", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_109", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_109", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_as_complex_8", + "op": 
"aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_107", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "convert_element_type_105", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_105", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_110", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_110", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_as_complex_9", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_111", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_111", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "alias_default_123", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + 
"cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_123", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "mul_30", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_30", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_as_real_8", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_112", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + 
"code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_123", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "mul_31", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_as_real_9", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_9", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_113", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_112", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "convert_element_type_106", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_113", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "convert_element_type_107", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + 
{ + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_107", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "unsqueeze_8", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "expand_8", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "clone_8", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", 
+ "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_114", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_108", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "unsqueeze_9", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "expand_9", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "clone_9", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_115", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_106", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "permute_47", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + 
"source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_114", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "permute_48", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_115", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "permute_49", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_47", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "alias_default_124", 
+ "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_48", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "alias_default_125", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_49", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "alias_default_126", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "alias_default_124", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_125", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_126", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_4", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_4", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.sdpa", + "name": "getitem_36", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_4", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.sdpa", + "name": "getitem_37", + "op": "", + 
"phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_4", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.sdpa", + "name": "getitem_42", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_4", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.sdpa", + "name": "getitem_43", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_36", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.sdpa", + "name": "alias_default_127", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + 
"shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_127", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "permute_50", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_50", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_116", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_41", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wo", + "name": "dtype_cast_41", + "op": 
"autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_41", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention.wo", + "name": "permute_51", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_116", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "alias_default_128", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_51", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.4.attention.wo", + "name": "alias_default_129", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_128", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_129", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wo", + "name": "einsum_default_31", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_115", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4", + "name": "add_21", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + 
"line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_46", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "dtype_cast_42", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4", + "name": "alias_default_130", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_130", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "convert_element_type_110", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_110", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "alias_default_132", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_132", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "pow_10", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_10", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "mean_9", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": 
{ + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "add_22", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_22", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "rsqrt_9", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "alias_default_133", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_132", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_133", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "mul_32", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_42", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "alias_default_131", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_32", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "alias_default_131", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "mul_33", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_33", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "convert_element_type_111", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_42", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w1", + "name": "dtype_cast_43", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": 
"permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_43", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w1", + "name": "permute_52", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_111", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "alias_default_134", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_52", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w1", + "name": "alias_default_135", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 
+ }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_134", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_135", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w1", + "name": "einsum_default_32", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_32", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w1", + "name": "alias_default_136", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_136", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "convert_element_type_114", + "op": 
"prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_114", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "alias_default_137", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_137", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "neg_4", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_4", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "exp_4", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_4", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "add_23", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_137", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "div_4", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": 
"convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_4", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "convert_element_type_115", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_44", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w3", + "name": "dtype_cast_44", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_44", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w3", + "name": "permute_53", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_53", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w3", + "name": "alias_default_139", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_134", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_139", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w3", + "name": "einsum_default_33", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_115", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.4.feed_forward", + "name": "alias_default_138", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_33", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w3", + "name": "alias_default_140", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_138", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_140", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "mul_34", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + 
"cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_43", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w2", + "name": "dtype_cast_45", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_45", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w2", + "name": "permute_54", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "alias_default_141", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_54", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w2", + "name": "alias_default_142", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_141", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_142", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w2", + "name": "einsum_default_34", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_130", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 
430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_34", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4", + "name": "add_24", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_54", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "dtype_cast_46", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4", + "name": "alias_default_143", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + 
"compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_143", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "convert_element_type_120", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_120", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "alias_default_145", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_145", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "pow_11", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": 
"rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_11", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "mean_10", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_10", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "add_25", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "rsqrt_10", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + 
"line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_10", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "alias_default_146", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_145", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_146", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "mul_35", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_46", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "alias_default_144", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 
4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_35", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_144", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "mul_36", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_36", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "convert_element_type_121", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_47", + "src_placement": 
"S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wq", + "name": "dtype_cast_47", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_47", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention.wq", + "name": "permute_55", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_121", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "alias_default_147", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_55", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wq", + "name": "alias_default_148", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_147", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_148", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wq", + "name": "einsum_default_35", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_48", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wk", + "name": "dtype_cast_48", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_48", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention.wk", + "name": "permute_56", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_56", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wk", + "name": "alias_default_149", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_147", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_149", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention.wk", + "name": "einsum_default_36", 
+ "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_49", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wv", + "name": "dtype_cast_49", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_49", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention.wv", + "name": "permute_57", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_57", + "src_placement": "RS(1)", + "transition_cost": 0 + } + 
], + "module_path": "L['self'].layers.5.attention.wv", + "name": "alias_default_150", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_147", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_150", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wv", + "name": "einsum_default_37", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_35", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_131", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + 
"cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_36", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_132", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_133", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_131", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "convert_element_type_128", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_128", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_134", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_134", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_as_complex_10", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_132", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "convert_element_type_129", + "op": 
"prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_129", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_135", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_135", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_as_complex_11", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, 
+ "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_136", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_136", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "alias_default_151", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_151", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "mul_37", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + 
"line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_as_real_10", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_137", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_151", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "mul_38", + "op": "aten.mul.Tensor", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_38", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_as_real_11", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_138", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_137", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "convert_element_type_130", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_138", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "convert_element_type_131", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_131", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "unsqueeze_10", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "expand_10", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "clone_10", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_139", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_133", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "unsqueeze_11", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "expand_11", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "clone_11", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_140", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_130", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "permute_58", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_139", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "permute_59", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], 
+ "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_140", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "permute_60", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_58", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "alias_default_152", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_59", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": 
"alias_default_153", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_60", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "alias_default_154", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_152", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_153", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_154", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_5", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 
8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.sdpa", + "name": "getitem_45", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.sdpa", + "name": "getitem_46", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_5", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.sdpa", + "name": "getitem_51", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + 
], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_5", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.sdpa", + "name": "getitem_52", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_45", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.sdpa", + "name": "alias_default_155", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_155", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "permute_61", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_61", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_141", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_50", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wo", + "name": "dtype_cast_50", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_50", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention.wo", + "name": "permute_62", + "op": 
"aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_141", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "alias_default_156", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_62", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wo", + "name": "alias_default_157", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_156", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_157", 
+ "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wo", + "name": "einsum_default_38", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_143", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_38", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5", + "name": "add_26", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_55", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "dtype_cast_51", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5", + "name": "alias_default_158", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_158", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "convert_element_type_134", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_134", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "alias_default_160", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_160", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "pow_12", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "mean_11", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_11", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "add_27", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "rsqrt_11", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_11", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "alias_default_161", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_160", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_161", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "mul_39", + "op": "aten.mul.Tensor", + "phase": "forward", + 
"placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_51", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "alias_default_159", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_39", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_159", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "mul_40", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_40", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "convert_element_type_135", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_51", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w1", + "name": "dtype_cast_52", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_52", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w1", + "name": "permute_63", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_135", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "alias_default_162", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_63", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w1", + "name": "alias_default_163", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_162", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_163", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w1", + "name": "einsum_default_39", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * 
self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_39", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w1", + "name": "alias_default_164", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_164", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "convert_element_type_138", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_138", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "alias_default_165", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_165", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "neg_5", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "exp_5", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.5.feed_forward", + "name": "add_28", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_165", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "div_5", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "convert_element_type_139", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + 
"cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_53", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w3", + "name": "dtype_cast_53", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_53", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w3", + "name": "permute_64", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_64", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w3", + "name": "alias_default_167", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + 
"func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_162", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_167", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w3", + "name": "einsum_default_40", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_139", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "alias_default_166", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_40", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w3", + "name": "alias_default_168", + 
"op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_166", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_168", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "mul_41", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_52", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w2", + "name": "dtype_cast_54", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_54", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w2", + "name": "permute_65", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_41", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "alias_default_169", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_65", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w2", + "name": "alias_default_170", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_169", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_170", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w2", + "name": "einsum_default_41", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_158", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_41", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5", + "name": "add_29", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_63", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.6.attention_norm", + "name": "dtype_cast_55", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_29", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5", + "name": "alias_default_171", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_171", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "convert_element_type_144", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "convert_element_type_144", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "alias_default_173", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_173", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "pow_13", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_13", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "mean_12", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "add_30", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_30", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "rsqrt_12", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "alias_default_174", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_173", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_174", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "mul_42", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_55", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "alias_default_172", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_42", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_172", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "mul_43", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + 
"source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_43", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "convert_element_type_145", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_56", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wq", + "name": "dtype_cast_56", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_56", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention.wq", + "name": "permute_66", + 
"op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_145", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "alias_default_175", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_66", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wq", + "name": "alias_default_176", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_175", + "src_placement": "S(0)R", + "transition_cost": 
0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_176", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wq", + "name": "einsum_default_42", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_57", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wk", + "name": "dtype_cast_57", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_57", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention.wk", + "name": "permute_67", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + 
"cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_67", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wk", + "name": "alias_default_177", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_175", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_177", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention.wk", + "name": "einsum_default_43", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_58", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wv", + "name": "dtype_cast_58", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_58", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention.wv", + "name": "permute_68", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_68", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wv", + "name": "alias_default_178", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_175", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_178", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.6.attention.wv", + "name": "einsum_default_44", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_42", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_156", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_157", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_44", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_158", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_156", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "convert_element_type_152", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_152", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_159", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_159", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_as_complex_12", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_157", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "convert_element_type_153", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_153", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_160", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = 
torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_160", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_as_complex_13", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_161", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_161", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "alias_default_179", + "op": "aten.alias.default", + 
"phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_12", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_179", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "mul_44", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_44", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_as_real_12", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_12", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_162", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_179", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "mul_45", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_45", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_as_real_13", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_163", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_162", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "convert_element_type_154", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_163", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "convert_element_type_155", + "op": 
"prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_155", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "unsqueeze_12", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_12", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "expand_12", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_12", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "clone_12", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_12", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_164", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_158", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "unsqueeze_13", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"unsqueeze_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "expand_13", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "clone_13", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_165", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_154", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "permute_69", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_164", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "permute_70", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_165", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "permute_71", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_69", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "alias_default_180", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_70", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "alias_default_181", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_71", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "alias_default_182", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, 
n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_180", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_181", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_182", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_6", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.sdpa", + "name": "getitem_54", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.sdpa", + "name": "getitem_55", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_6", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.sdpa", + "name": "getitem_60", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_6", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.sdpa", + "name": "getitem_61", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": 
"alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_54", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.sdpa", + "name": "alias_default_183", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_183", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "permute_72", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_72", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_166", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + 
"transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_59", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wo", + "name": "dtype_cast_59", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_59", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention.wo", + "name": "permute_73", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_166", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "alias_default_184", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_73", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wo", + "name": "alias_default_185", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_184", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_185", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wo", + "name": "einsum_default_45", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_171", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_45", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6", + "name": "add_31", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_64", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "dtype_cast_60", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6", + "name": "alias_default_186", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_186", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "convert_element_type_158", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_158", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "alias_default_188", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_188", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "pow_14", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + 
"cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "mean_13", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_13", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "add_32", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_32", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "rsqrt_13", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": 
"alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_13", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "alias_default_189", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_188", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_189", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "mul_46", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_60", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "alias_default_187", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_46", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_187", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "mul_47", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_47", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "convert_element_type_159", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_60", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.6.feed_forward.w1", + "name": "dtype_cast_61", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_61", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w1", + "name": "permute_74", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_159", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "alias_default_190", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + 
"name": "permute_74", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w1", + "name": "alias_default_191", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_190", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_191", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w1", + "name": "einsum_default_46", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_46", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w1", + "name": "alias_default_192", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_192", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "convert_element_type_162", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_162", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "alias_default_193", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_193", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "neg_6", + "op": "aten.neg.default", + "phase": "forward", 
+ "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "exp_6", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "add_33", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_193", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"add_33", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "div_6", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "convert_element_type_163", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_62", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w3", + "name": "dtype_cast_62", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": 
"permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_62", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w3", + "name": "permute_75", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_75", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w3", + "name": "alias_default_195", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_190", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_195", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w3", + "name": "einsum_default_47", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_163", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "alias_default_194", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_47", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w3", + "name": "alias_default_196", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_194", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_196", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "mul_48", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_61", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w2", + "name": "dtype_cast_63", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_63", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w2", + "name": "permute_76", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_48", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "alias_default_197", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_76", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w2", + "name": "alias_default_198", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_197", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_198", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w2", + "name": "einsum_default_48", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_186", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_48", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6", + "name": "add_34", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_72", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "dtype_cast_64", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_34", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6", + "name": 
"alias_default_199", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_199", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "convert_element_type_168", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_168", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "alias_default_201", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"alias_default_201", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "pow_15", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "mean_14", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "add_35", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"add_35", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "rsqrt_14", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "alias_default_202", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_201", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_202", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "mul_49", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_64", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "alias_default_200", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_49", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_200", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "mul_50", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_50", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "convert_element_type_169", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + 
], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_65", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wq", + "name": "dtype_cast_65", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_65", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention.wq", + "name": "permute_77", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_169", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "alias_default_203", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_77", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wq", + "name": "alias_default_204", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_203", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_204", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wq", + "name": "einsum_default_49", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_66", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wk", + "name": "dtype_cast_66", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_66", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention.wk", + "name": "permute_78", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_78", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wk", + "name": "alias_default_205", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + 
"compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_203", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_205", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention.wk", + "name": "einsum_default_50", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_67", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wv", + "name": "dtype_cast_67", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_67", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention.wv", + "name": "permute_79", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_79", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wv", + "name": "alias_default_206", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_203", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_206", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wv", + "name": "einsum_default_51", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_49", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.7.attention", + "name": "view_181", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_50", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_182", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_51", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_183", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_181", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "convert_element_type_176", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_176", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_184", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_184", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_as_complex_14", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_182", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "convert_element_type_177", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_177", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_185", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_185", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_as_complex_15", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + 
], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_186", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_186", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "alias_default_207", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_207", + "src_placement": "RR", + "transition_cost": 
0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "mul_51", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_51", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_as_real_14", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_187", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_15", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_207", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "mul_52", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_as_real_15", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_15", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_188", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_187", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "convert_element_type_178", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_188", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "convert_element_type_179", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_179", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "unsqueeze_14", 
+ "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "expand_14", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "clone_14", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.7.attention", + "name": "view_189", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_183", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "unsqueeze_15", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_15", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "expand_15", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_15", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "clone_15", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_15", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_190", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_178", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "permute_80", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_189", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "permute_81", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_190", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "permute_82", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_80", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "alias_default_208", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_81", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "alias_default_209", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_82", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "alias_default_210", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_208", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_209", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_210", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_7", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_7", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.sdpa", + "name": "getitem_63", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_7", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.sdpa", + "name": "getitem_64", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 
0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_7", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.sdpa", + "name": "getitem_69", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_7", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.sdpa", + "name": "getitem_70", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_63", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.sdpa", + "name": "alias_default_211", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_211", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "permute_83", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_83", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_191", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_68", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wo", + "name": "dtype_cast_68", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + 
{ + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_68", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention.wo", + "name": "permute_84", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_191", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "alias_default_212", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_84", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wo", + "name": "alias_default_213", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_212", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_213", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wo", + "name": "einsum_default_52", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_199", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_52", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7", + "name": "add_36", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_73", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "dtype_cast_69", + 
"op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_36", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7", + "name": "alias_default_214", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_214", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "convert_element_type_182", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_182", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "alias_default_216", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_216", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "pow_16", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "mean_15", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "mean_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "add_37", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_37", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "rsqrt_15", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "alias_default_217", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "alias_default_216", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_217", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "mul_53", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_69", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "alias_default_215", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_53", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_215", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "mul_54", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + 
"file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_54", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "convert_element_type_183", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_69", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w1", + "name": "dtype_cast_70", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_70", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w1", + "name": "permute_85", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + 
"shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_183", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "alias_default_218", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_85", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w1", + "name": "alias_default_219", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_218", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": 
"alias_default_219", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w1", + "name": "einsum_default_53", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w1", + "name": "alias_default_220", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_220", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "convert_element_type_186", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, 
+ "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_186", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "alias_default_221", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_221", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "neg_7", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "exp_7", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "add_38", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_221", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_38", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "div_7", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "convert_element_type_187", + "op": "prims.convert_element_type.default", + "phase": 
"forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_71", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w3", + "name": "dtype_cast_71", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_71", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w3", + "name": "permute_86", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_86", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.7.feed_forward.w3", + "name": "alias_default_223", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_218", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_223", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w3", + "name": "einsum_default_54", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_187", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "alias_default_222", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 
100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w3", + "name": "alias_default_224", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_222", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_224", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "mul_55", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_70", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w2", + "name": "dtype_cast_72", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, 
+ 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_72", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w2", + "name": "permute_87", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_55", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "alias_default_225", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_87", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w2", + "name": "alias_default_226", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_225", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_226", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w2", + "name": "einsum_default_55", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_214", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_55", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7", + "name": "add_39", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + 
"line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_81", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "dtype_cast_73", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_39", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7", + "name": "alias_default_227", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_227", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "convert_element_type_192", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_192", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "alias_default_229", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_229", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "pow_17", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_17", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "mean_16", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + 
], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "add_40", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_40", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "rsqrt_16", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "alias_default_230", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + 
"source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_229", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_230", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "mul_56", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_73", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "alias_default_228", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_56", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, 
+ { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_228", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "mul_57", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_57", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "convert_element_type_193", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_74", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wq", + "name": "dtype_cast_74", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 
13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_74", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention.wq", + "name": "permute_88", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_193", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "alias_default_231", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_88", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wq", + "name": "alias_default_232", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_231", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_232", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wq", + "name": "einsum_default_56", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_75", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wk", + "name": "dtype_cast_75", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_75", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention.wk", + "name": "permute_89", + "op": "aten.permute.default", + "phase": 
"forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_89", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wk", + "name": "alias_default_233", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_231", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_233", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention.wk", + "name": "einsum_default_57", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + 
"name": "primals_76", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wv", + "name": "dtype_cast_76", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_76", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention.wv", + "name": "permute_90", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_90", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wv", + "name": "alias_default_234", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, 
+ "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_231", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_234", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wv", + "name": "einsum_default_58", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_206", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_207", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_208", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_206", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "convert_element_type_200", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_200", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_209", + "op": "aten.view.default", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_209", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_as_complex_16", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_207", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "convert_element_type_201", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "convert_element_type_201", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_210", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_210", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_as_complex_17", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_211", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_211", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "alias_default_235", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_235", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "mul_58", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_as_real_16", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + 
"code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_212", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_17", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_235", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "mul_59", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_59", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_as_real_17", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_17", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_213", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_212", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "convert_element_type_202", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": 
"convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_213", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "convert_element_type_203", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_203", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "unsqueeze_16", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "expand_16", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "clone_16", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_214", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_208", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "unsqueeze_17", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + 
"source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_17", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "expand_17", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_17", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "clone_17", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_17", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_215", + "op": "aten.view.default", + "phase": "forward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_202", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "permute_91", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_214", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "permute_92", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_215", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.8.attention", + "name": "permute_93", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_91", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "alias_default_236", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_92", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "alias_default_237", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_93", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "alias_default_238", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_236", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_237", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_238", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_8", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_8", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.sdpa", + "name": "getitem_72", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_8", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.sdpa", + "name": "getitem_73", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_8", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.sdpa", + "name": "getitem_78", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_8", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.8.attention.sdpa", + "name": "getitem_79", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_72", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.sdpa", + "name": "alias_default_239", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_239", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "permute_94", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_94", + "src_placement": 
"S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_216", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_77", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wo", + "name": "dtype_cast_77", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_77", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention.wo", + "name": "permute_95", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "view_216", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "alias_default_240", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_95", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wo", + "name": "alias_default_241", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_240", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_241", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wo", + "name": "einsum_default_59", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 
68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_227", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_59", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8", + "name": "add_41", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_82", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "dtype_cast_78", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_41", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8", + "name": "alias_default_242", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + 
self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_242", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "convert_element_type_206", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_206", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "alias_default_244", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_244", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "pow_18", + "op": 
"aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "mean_17", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_17", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "add_42", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_42", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "rsqrt_17", + "op": "aten.rsqrt.default", 
+ "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_17", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "alias_default_245", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_244", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_245", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "mul_60", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": 
"dtype_cast_78", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "alias_default_243", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_60", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_243", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "mul_61", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_61", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "convert_element_type_207", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_78", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w1", + "name": "dtype_cast_79", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_79", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w1", + "name": "permute_96", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_207", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "alias_default_246", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_96", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w1", + "name": "alias_default_247", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_246", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_247", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w1", + "name": "einsum_default_60", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_60", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.8.feed_forward.w1", + "name": "alias_default_248", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_248", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "convert_element_type_210", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_210", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "alias_default_249", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_249", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "neg_8", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "exp_8", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "add_43", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 
273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_249", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "div_8", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "convert_element_type_211", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_80", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w3", + "name": "dtype_cast_80", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + 
"code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_80", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w3", + "name": "permute_97", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_97", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w3", + "name": "alias_default_251", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_246", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_251", + "src_placement": "RS(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w3", + "name": "einsum_default_61", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_211", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "alias_default_250", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_61", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w3", + "name": "alias_default_252", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_250", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_252", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "mul_62", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_79", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w2", + "name": "dtype_cast_81", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_81", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w2", + "name": "permute_98", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_62", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "alias_default_253", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_98", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w2", + "name": "alias_default_254", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_253", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_254", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.8.feed_forward.w2", + "name": "einsum_default_62", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_242", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_62", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8", + "name": "add_44", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_90", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "dtype_cast_82", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + 
"cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_44", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8", + "name": "alias_default_255", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_255", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "convert_element_type_216", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_216", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "alias_default_257", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_257", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "pow_19", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "mean_18", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "add_45", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
+ "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_45", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "rsqrt_18", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "alias_default_258", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_257", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_258", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "mul_63", + "op": "aten.mul.Tensor", + 
"phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_82", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "alias_default_256", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_63", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_256", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "mul_64", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "mul_64", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "convert_element_type_217", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_83", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wq", + "name": "dtype_cast_83", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_83", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention.wq", + "name": "permute_99", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_217", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "alias_default_259", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_99", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wq", + "name": "alias_default_260", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_259", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_260", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wq", + "name": "einsum_default_63", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_84", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wk", + "name": "dtype_cast_84", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_84", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention.wk", + "name": "permute_100", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_100", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wk", + "name": "alias_default_261", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, 
+ 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_259", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_261", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention.wk", + "name": "einsum_default_64", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_85", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wv", + "name": "dtype_cast_85", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": 
"dtype_cast_85", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention.wv", + "name": "permute_101", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_101", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wv", + "name": "alias_default_262", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_259", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_262", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wv", + "name": "einsum_default_65", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 
290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_63", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_231", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_64", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_232", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_65", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_233", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_231", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "convert_element_type_224", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_224", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_234", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_234", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_as_complex_18", + "op": 
"aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_232", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "convert_element_type_225", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_225", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_235", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_235", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_as_complex_19", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_236", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_236", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "alias_default_263", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + 
"cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_263", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "mul_65", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_65", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_as_real_18", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_237", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": 
{ + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_263", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "mul_66", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_66", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_as_real_19", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_19", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_238", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_237", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "convert_element_type_226", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_238", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "convert_element_type_227", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + 
{ + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_227", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "unsqueeze_18", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "expand_18", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "clone_18", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_239", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_233", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "unsqueeze_19", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "expand_19", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, 
slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "clone_19", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_240", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_226", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "permute_102", + "op": "aten.permute.default", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_239", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "permute_103", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_240", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "permute_104", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_102", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.9.attention", + "name": "alias_default_264", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_103", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "alias_default_265", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_104", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "alias_default_266", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_264", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_265", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_266", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_9", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.sdpa", + "name": "getitem_81", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.9.attention.sdpa", + "name": "getitem_82", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_9", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.sdpa", + "name": "getitem_87", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_9", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.sdpa", + "name": "getitem_88", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_81", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.sdpa", + "name": "alias_default_267", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_267", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "permute_105", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_105", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_241", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_86", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.9.attention.wo", + "name": "dtype_cast_86", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_86", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention.wo", + "name": "permute_106", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_241", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "alias_default_268", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_106", + "src_placement": "RR", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wo", + "name": "alias_default_269", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_268", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_269", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wo", + "name": "einsum_default_66", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_255", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_66", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9", + "name": "add_46", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_91", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "dtype_cast_87", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_46", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9", + "name": "alias_default_270", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_270", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "convert_element_type_230", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_230", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "alias_default_272", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_272", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "pow_20", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "mean_19", + "op": 
"aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "add_47", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_47", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "rsqrt_19", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "alias_default_273", + "op": "aten.alias.default", 
+ "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_272", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_273", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "mul_67", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_87", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "alias_default_271", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", 
+ "name": "mul_67", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_271", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "mul_68", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_68", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "convert_element_type_231", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_87", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w1", + "name": "dtype_cast_88", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + 
"func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_88", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w1", + "name": "permute_107", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_231", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "alias_default_274", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_107", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w1", + "name": "alias_default_275", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_274", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_275", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w1", + "name": "einsum_default_67", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_67", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w1", + "name": "alias_default_276", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_276", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } 
+ ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "convert_element_type_234", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_234", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "alias_default_277", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_277", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "neg_9", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "exp_9", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "add_48", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_277", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_48", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "div_9", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "convert_element_type_235", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_89", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w3", + "name": "dtype_cast_89", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_89", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w3", + "name": "permute_108", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": 
"return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_108", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w3", + "name": "alias_default_279", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_274", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_279", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w3", + "name": "einsum_default_68", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_235", + "src_placement": "S(0)S(2)", 
+ "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "alias_default_278", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_68", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w3", + "name": "alias_default_280", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_278", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_280", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "mul_69", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_88", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w2", + "name": "dtype_cast_90", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_90", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w2", + "name": "permute_109", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_69", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "alias_default_281", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_109", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w2", + "name": "alias_default_282", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_281", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_282", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w2", + "name": "einsum_default_69", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_270", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 
430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_69", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9", + "name": "add_49", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_99", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "dtype_cast_91", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_49", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9", + "name": "alias_default_283", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + 
"compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_283", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "convert_element_type_240", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_240", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "alias_default_285", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_285", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "pow_21", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": 
"rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "mean_20", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "add_50", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_50", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "rsqrt_20", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + 
"line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "alias_default_286", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_285", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_286", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "mul_70", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_91", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "alias_default_284", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": 
[ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_70", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_284", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "mul_71", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_71", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "convert_element_type_241", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_92", + 
"src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wq", + "name": "dtype_cast_92", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_92", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention.wq", + "name": "permute_110", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_241", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "alias_default_287", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_110", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wq", + "name": "alias_default_288", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_287", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_288", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wq", + "name": "einsum_default_70", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_93", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wk", + "name": "dtype_cast_93", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_93", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention.wk", + "name": "permute_111", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_111", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wk", + "name": "alias_default_289", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_287", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_289", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention.wk", + "name": 
"einsum_default_71", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_94", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wv", + "name": "dtype_cast_94", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_94", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention.wv", + "name": "permute_112", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_112", + "src_placement": "RS(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wv", + "name": "alias_default_290", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_287", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_290", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wv", + "name": "einsum_default_72", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_70", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_256", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_71", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_257", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_72", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_258", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_256", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "convert_element_type_248", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_248", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_259", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_259", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_as_complex_20", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_257", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "convert_element_type_249", + 
"op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_249", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_260", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_260", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_as_complex_21", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_261", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_261", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "alias_default_291", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_20", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_291", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "mul_72", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_72", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_as_real_20", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_20", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_262", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_21", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_291", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "mul_73", + "op": "aten.mul.Tensor", + 
"phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_73", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_as_real_21", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_21", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_263", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_262", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "convert_element_type_250", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_263", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "convert_element_type_251", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_251", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "unsqueeze_20", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", 
+ "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_20", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "expand_20", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_20", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "clone_20", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_20", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_264", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_258", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "unsqueeze_21", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_21", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "expand_21", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_21", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "clone_21", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_21", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_265", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_250", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "permute_113", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_264", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "permute_114", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 
+ ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_265", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "permute_115", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_113", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "alias_default_292", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_114", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + 
"name": "alias_default_293", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_115", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "alias_default_294", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_292", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_293", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_294", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_10", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, 
+ 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_10", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.sdpa", + "name": "getitem_90", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_10", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.sdpa", + "name": "getitem_91", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_10", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.sdpa", + "name": "getitem_96", + "op": "", + "phase": "forward", + "placement": "RR", + 
"shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_10", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.sdpa", + "name": "getitem_97", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_90", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.sdpa", + "name": "alias_default_295", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_295", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "permute_116", + "op": "aten.permute.default", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_116", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_266", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_95", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wo", + "name": "dtype_cast_95", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_95", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention.wo", + "name": 
"permute_117", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_266", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "alias_default_296", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_117", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wo", + "name": "alias_default_297", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_296", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_297", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wo", + "name": "einsum_default_73", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_283", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_73", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10", + "name": "add_51", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_100", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "dtype_cast_96", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_51", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10", + "name": "alias_default_298", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_298", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "convert_element_type_254", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_254", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "alias_default_300", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_300", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "pow_22", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_22", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "mean_21", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "add_52", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_52", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "rsqrt_21", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "alias_default_301", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_300", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_301", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "mul_74", + "op": 
"aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_96", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "alias_default_299", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_74", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_299", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "mul_75", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_75", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "convert_element_type_255", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_96", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w1", + "name": "dtype_cast_97", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_97", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w1", + "name": "permute_118", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + 
"cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_255", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "alias_default_302", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_118", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w1", + "name": "alias_default_303", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_302", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_303", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w1", + "name": "einsum_default_74", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_74", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w1", + "name": "alias_default_304", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_304", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "convert_element_type_258", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_258", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.10.feed_forward", + "name": "alias_default_305", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_305", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "neg_10", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "exp_10", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_10", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "add_53", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_305", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "div_10", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "convert_element_type_259", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", 
+ "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_98", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w3", + "name": "dtype_cast_98", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_98", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w3", + "name": "permute_119", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_119", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w3", + "name": "alias_default_307", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_302", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_307", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w3", + "name": "einsum_default_75", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_259", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "alias_default_306", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_75", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.10.feed_forward.w3", + "name": "alias_default_308", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_306", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_308", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "mul_76", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_97", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w2", + "name": "dtype_cast_99", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, 
+ { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_99", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w2", + "name": "permute_120", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_76", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "alias_default_309", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_120", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w2", + "name": "alias_default_310", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_309", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_310", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w2", + "name": "einsum_default_76", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_298", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_76", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10", + "name": "add_54", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(0)", + "name": "primals_108", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "dtype_cast_100", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_54", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10", + "name": "alias_default_311", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_311", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "convert_element_type_264", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + 
"cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_264", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "alias_default_313", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_313", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "pow_23", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_23", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "mean_22", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + 
}, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_22", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "add_55", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_55", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "rsqrt_22", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_22", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "alias_default_314", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_313", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_314", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "mul_77", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_100", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "alias_default_312", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_77", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_312", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.11.attention_norm", + "name": "mul_78", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_78", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "convert_element_type_265", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_101", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wq", + "name": "dtype_cast_101", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": 
"RS(0)", + "name": "dtype_cast_101", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention.wq", + "name": "permute_121", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_265", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "alias_default_315", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_121", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wq", + "name": "alias_default_316", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 
198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_315", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_316", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wq", + "name": "einsum_default_77", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_102", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wk", + "name": "dtype_cast_102", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_102", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention.wk", + "name": "permute_122", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_122", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wk", + "name": "alias_default_317", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_315", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_317", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention.wk", + "name": "einsum_default_78", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_103", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.11.attention.wv", + "name": "dtype_cast_103", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_103", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention.wv", + "name": "permute_123", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_123", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wv", + "name": "alias_default_318", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + 
"name": "alias_default_315", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_318", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wv", + "name": "einsum_default_79", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_77", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_281", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_78", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_282", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_79", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_283", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_281", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "convert_element_type_272", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_272", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_284", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = 
torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_284", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_as_complex_22", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_282", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "convert_element_type_273", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_273", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_285", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_285", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_as_complex_23", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_286", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": 
"complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_286", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "alias_default_319", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_319", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "mul_79", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_79", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_as_real_22", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_287", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_319", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "mul_80", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_80", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.11.attention", + "name": "view_as_real_23", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_288", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_287", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "convert_element_type_274", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 
9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_288", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "convert_element_type_275", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_275", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "unsqueeze_22", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "expand_22", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + 
"line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "clone_22", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_289", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_283", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "unsqueeze_23", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "expand_23", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "clone_23", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_290", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + 
"source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_274", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "permute_124", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_289", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "permute_125", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_290", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "permute_126", + "op": 
"aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_124", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "alias_default_320", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_125", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "alias_default_321", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"permute_126", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "alias_default_322", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_320", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_321", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_322", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_11", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_11", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.11.attention.sdpa", + "name": "getitem_99", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_11", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.sdpa", + "name": "getitem_100", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_11", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.sdpa", + "name": "getitem_105", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_11", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.sdpa", + "name": "getitem_106", + 
"op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_99", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.sdpa", + "name": "alias_default_323", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_323", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "permute_127", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_127", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.11.attention", + "name": "view_291", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_104", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wo", + "name": "dtype_cast_104", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_104", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention.wo", + "name": "permute_128", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_291", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "alias_default_324", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_128", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wo", + "name": "alias_default_325", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_324", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_325", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wo", + "name": "einsum_default_80", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 
39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_311", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_80", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11", + "name": "add_56", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_109", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "dtype_cast_105", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_56", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11", + "name": "alias_default_326", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_326", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "convert_element_type_278", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_278", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "alias_default_328", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_328", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "pow_24", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "mean_23", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_23", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "add_57", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_57", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "rsqrt_23", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": 
[ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_23", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "alias_default_329", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_328", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_329", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "mul_81", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_105", + "src_placement": "S(0)S(0)", + 
"transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "alias_default_327", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_81", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_327", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "mul_82", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_82", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "convert_element_type_279", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + 
"cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_105", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w1", + "name": "dtype_cast_106", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_106", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w1", + "name": "permute_129", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_279", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "alias_default_330", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_129", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w1", + "name": "alias_default_331", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_330", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_331", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w1", + "name": "einsum_default_81", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_81", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w1", + 
"name": "alias_default_332", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_332", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "convert_element_type_282", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_282", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "alias_default_333", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "alias_default_333", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "neg_11", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "exp_11", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "add_58", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + 
"dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_333", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "div_11", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "convert_element_type_283", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_107", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w3", + "name": "dtype_cast_107", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_107", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w3", + "name": "permute_130", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_130", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w3", + "name": "alias_default_335", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_330", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_335", + "src_placement": "RS(1)", + "transition_cost": 0 + } + 
], + "module_path": "L['self'].layers.11.feed_forward.w3", + "name": "einsum_default_82", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_283", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "alias_default_334", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_82", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w3", + "name": "alias_default_336", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_334", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_336", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "mul_83", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_106", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w2", + "name": "dtype_cast_108", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_108", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w2", + "name": "permute_131", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_83", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "alias_default_337", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_131", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w2", + "name": "alias_default_338", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_337", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_338", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.11.feed_forward.w2", + "name": "einsum_default_83", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_326", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_83", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11", + "name": "add_59", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_117", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "dtype_cast_109", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + 
"cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_59", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11", + "name": "alias_default_339", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_339", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "convert_element_type_288", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_288", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "alias_default_341", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_341", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "pow_25", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "mean_24", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "add_60", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_60", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "rsqrt_24", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "alias_default_342", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_341", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_342", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "mul_84", + "op": "aten.mul.Tensor", 
+ "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_109", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "alias_default_340", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_84", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_340", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "mul_85", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "mul_85", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "convert_element_type_289", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_110", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wq", + "name": "dtype_cast_110", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_110", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention.wq", + "name": "permute_132", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": 
"alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_289", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "alias_default_343", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_132", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wq", + "name": "alias_default_344", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_343", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_344", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wq", + "name": "einsum_default_84", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, 
xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_111", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wk", + "name": "dtype_cast_111", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_111", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention.wk", + "name": "permute_133", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_133", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wk", + "name": "alias_default_345", + "op": "aten.alias.default", + "phase": "forward", + "placement": 
"RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_343", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_345", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention.wk", + "name": "einsum_default_85", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_112", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wv", + "name": "dtype_cast_112", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": 
"RS(0)", + "name": "dtype_cast_112", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention.wv", + "name": "permute_134", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_134", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wv", + "name": "alias_default_346", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_343", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_346", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wv", + "name": "einsum_default_86", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_84", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_306", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_85", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_307", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_86", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_308", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_306", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "convert_element_type_296", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_296", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_309", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_309", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_as_complex_24", + 
"op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_307", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "convert_element_type_297", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_297", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_310", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": 
"complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_310", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_as_complex_25", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_311", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_311", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "alias_default_347", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 
36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_347", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "mul_86", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_86", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_as_real_24", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_312", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + 
"source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_347", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "mul_87", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_87", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_as_real_25", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"view_as_real_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_313", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_312", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "convert_element_type_298", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_313", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "convert_element_type_299", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_299", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "unsqueeze_24", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "expand_24", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "clone_24", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_314", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_308", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "unsqueeze_25", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "expand_25", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": 
".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "clone_25", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_315", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_298", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "permute_135", + "op": "aten.permute.default", + "phase": "forward", + 
"placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_314", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "permute_136", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_315", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "permute_137", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_135", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.12.attention", + "name": "alias_default_348", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_136", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "alias_default_349", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_137", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "alias_default_350", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 
794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_348", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_349", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_350", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_12", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.sdpa", + "name": "getitem_108", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.12.attention.sdpa", + "name": "getitem_109", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_12", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.sdpa", + "name": "getitem_114", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_12", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.sdpa", + "name": "getitem_115", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_108", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.sdpa", + "name": 
"alias_default_351", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_351", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "permute_138", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_138", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_316", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_113", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + 
} + ], + "module_path": "L['self'].layers.12.attention.wo", + "name": "dtype_cast_113", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_113", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention.wo", + "name": "permute_139", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_316", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "alias_default_352", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_139", + 
"src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wo", + "name": "alias_default_353", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_352", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_353", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wo", + "name": "einsum_default_87", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_339", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_87", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12", + "name": "add_61", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_118", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "dtype_cast_114", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_61", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12", + "name": "alias_default_354", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_354", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "convert_element_type_302", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_302", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "alias_default_356", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_356", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "pow_26", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "mean_25", + "op": 
"aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "add_62", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_62", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "rsqrt_25", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "alias_default_357", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_356", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_357", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "mul_88", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_114", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "alias_default_355", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_88", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_355", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "mul_89", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_89", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "convert_element_type_303", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_114", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w1", + "name": "dtype_cast_115", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_115", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w1", + "name": "permute_140", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_303", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "alias_default_358", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_140", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w1", + "name": "alias_default_359", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_358", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_359", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w1", + "name": "einsum_default_88", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_88", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w1", + "name": "alias_default_360", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + 
"name": "alias_default_360", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "convert_element_type_306", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_306", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "alias_default_361", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_361", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "neg_12", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": 
"exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_12", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "exp_12", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_12", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "add_63", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_361", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_63", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "div_12", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_12", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "convert_element_type_307", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_116", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w3", + "name": "dtype_cast_116", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_116", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w3", + "name": "permute_141", + "op": "aten.permute.default", + 
"phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_141", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w3", + "name": "alias_default_363", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_358", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_363", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w3", + "name": "einsum_default_89", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_307", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "alias_default_362", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_89", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w3", + "name": "alias_default_364", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_362", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_364", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "mul_90", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_115", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w2", + "name": "dtype_cast_117", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_117", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w2", + "name": "permute_142", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_90", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "alias_default_365", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ 
+ 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_142", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w2", + "name": "alias_default_366", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_365", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_366", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w2", + "name": "einsum_default_90", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "alias_default_354", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_90", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12", + "name": "add_64", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_126", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "dtype_cast_118", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_64", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12", + "name": "alias_default_367", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 
420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_367", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "convert_element_type_312", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_312", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "alias_default_369", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_369", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "pow_27", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "mean_26", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "add_65", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_65", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "rsqrt_26", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "alias_default_370", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_369", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_370", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "mul_91", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_118", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": 
"alias_default_368", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_91", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_368", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "mul_92", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_92", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "convert_element_type_313", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_119", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wq", + "name": "dtype_cast_119", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_119", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention.wq", + "name": "permute_143", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_313", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "alias_default_371", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + 
}, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_143", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wq", + "name": "alias_default_372", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_371", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_372", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wq", + "name": "einsum_default_91", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_120", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wk", + "name": "dtype_cast_120", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 
4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_120", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention.wk", + "name": "permute_144", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_144", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wk", + "name": "alias_default_373", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_371", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_373", + "src_placement": "RR", + 
"transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention.wk", + "name": "einsum_default_92", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_121", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wv", + "name": "dtype_cast_121", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_121", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention.wv", + "name": "permute_145", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_145", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wv", + "name": "alias_default_374", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_371", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_374", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wv", + "name": "einsum_default_93", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_91", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_331", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_92", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_332", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_93", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_333", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_331", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "convert_element_type_320", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_320", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_334", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_334", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_as_complex_26", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_332", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "convert_element_type_321", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_321", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_335", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_335", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_as_complex_27", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 
+ }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_336", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_336", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "alias_default_375", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_375", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "mul_93", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * 
freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_93", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_as_real_26", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_337", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_27", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_375", + "src_placement": "RR", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "mul_94", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_94", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_as_real_27", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_27", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_338", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_337", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "convert_element_type_322", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_338", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "convert_element_type_323", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_323", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "unsqueeze_26", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "expand_26", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "clone_26", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_339", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": 
{ + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_333", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "unsqueeze_27", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_27", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "expand_27", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_27", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "clone_27", + "op": "aten.clone.default", + "phase": "forward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_27", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_340", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_322", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "permute_146", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_339", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.13.attention", + "name": "permute_147", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_340", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "permute_148", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_146", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "alias_default_376", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_147", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "alias_default_377", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_148", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "alias_default_378", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_376", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_377", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_378", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_13", 
+ "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_13", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.sdpa", + "name": "getitem_117", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_13", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.sdpa", + "name": "getitem_118", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_13", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.sdpa", + "name": "getitem_123", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_13", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.sdpa", + "name": "getitem_124", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_117", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.sdpa", + "name": "alias_default_379", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_379", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "permute_149", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_149", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_341", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_122", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wo", + "name": "dtype_cast_122", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + 
"dst_placement": "RR", + "name": "dtype_cast_122", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention.wo", + "name": "permute_150", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_341", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "alias_default_380", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_150", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wo", + "name": "alias_default_381", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_380", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_381", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wo", + "name": "einsum_default_94", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_367", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_94", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13", + "name": "add_66", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_127", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "dtype_cast_123", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_66", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13", + "name": "alias_default_382", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_382", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "convert_element_type_326", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_326", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "alias_default_384", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_384", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "pow_28", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_28", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "mean_27", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "add_67", 
+ "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_67", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "rsqrt_27", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "alias_default_385", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_384", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "alias_default_385", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "mul_95", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_123", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "alias_default_383", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_95", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_383", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "mul_96", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_96", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "convert_element_type_327", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_123", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w1", + "name": "dtype_cast_124", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_124", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w1", + "name": "permute_151", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_327", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "alias_default_386", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_151", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w1", + "name": "alias_default_387", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_386", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_387", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.13.feed_forward.w1", + "name": "einsum_default_95", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_95", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w1", + "name": "alias_default_388", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_388", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "convert_element_type_330", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_330", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "alias_default_389", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_389", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "neg_13", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "exp_13", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + 
"cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "add_68", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_389", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_68", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "div_13", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "convert_element_type_331", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_125", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w3", + "name": "dtype_cast_125", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_125", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w3", + "name": "permute_152", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_152", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w3", + "name": "alias_default_391", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_386", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_391", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w3", + "name": "einsum_default_96", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_331", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "alias_default_390", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_96", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w3", + "name": "alias_default_392", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_390", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_392", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "mul_97", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_124", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w2", + "name": "dtype_cast_126", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_126", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w2", + "name": "permute_153", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_97", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "alias_default_393", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_153", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w2", + "name": "alias_default_394", + "op": "aten.alias.default", + "phase": "forward", + 
"placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_393", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_394", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w2", + "name": "einsum_default_97", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_382", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_97", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13", + "name": "add_69", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { 
+ "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_135", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "dtype_cast_127", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_69", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13", + "name": "alias_default_395", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_395", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "convert_element_type_336", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_336", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "alias_default_397", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_397", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "pow_29", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_29", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "mean_28", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_28", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "add_70", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_70", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "rsqrt_28", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_28", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "alias_default_398", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_397", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_398", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "mul_98", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_127", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "alias_default_396", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_98", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "alias_default_396", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "mul_99", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_99", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "convert_element_type_337", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_128", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wq", + "name": "dtype_cast_128", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": 
"permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_128", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention.wq", + "name": "permute_154", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_337", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "alias_default_399", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_154", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wq", + "name": "alias_default_400", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + 
"line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_399", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_400", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wq", + "name": "einsum_default_98", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_129", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wk", + "name": "dtype_cast_129", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_129", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention.wk", + "name": "permute_155", + "op": "aten.permute.default", + "phase": 
"forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_155", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wk", + "name": "alias_default_401", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_399", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_401", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention.wk", + "name": "einsum_default_99", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + 
"name": "primals_130", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wv", + "name": "dtype_cast_130", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_130", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention.wv", + "name": "permute_156", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_156", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wv", + "name": "alias_default_402", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 
56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_399", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_402", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wv", + "name": "einsum_default_100", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_98", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_356", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_99", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_357", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_100", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_358", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_356", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "convert_element_type_344", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_344", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_359", + "op": "aten.view.default", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_359", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_as_complex_28", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_357", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "convert_element_type_345", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "convert_element_type_345", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_360", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_360", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_as_complex_29", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_361", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_361", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "alias_default_403", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_403", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "mul_100", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_100", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_as_real_28", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { 
+ "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_362", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_403", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "mul_101", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_101", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_as_real_29", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_363", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_362", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "convert_element_type_346", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": 
"convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_363", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "convert_element_type_347", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_347", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "unsqueeze_28", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "expand_28", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "clone_28", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_364", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_358", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "unsqueeze_29", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + 
"source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "expand_29", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "clone_29", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_365", + "op": "aten.view.default", + "phase": "forward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_346", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "permute_157", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_364", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "permute_158", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_365", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.14.attention", + "name": "permute_159", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_157", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "alias_default_404", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_158", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "alias_default_405", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_159", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "alias_default_406", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_404", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_405", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_406", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_14", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_14", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.sdpa", + "name": "getitem_126", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.sdpa", + "name": "getitem_127", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_14", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.sdpa", + "name": "getitem_132", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_14", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.14.attention.sdpa", + "name": "getitem_133", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_126", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.sdpa", + "name": "alias_default_407", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_407", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "permute_160", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_160", + 
"src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_366", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_131", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wo", + "name": "dtype_cast_131", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_131", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention.wo", + "name": "permute_161", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "view_366", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "alias_default_408", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_161", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wo", + "name": "alias_default_409", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_408", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_409", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wo", + "name": "einsum_default_101", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_395", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_101", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14", + "name": "add_71", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_136", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "dtype_cast_132", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_71", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14", + "name": "alias_default_410", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 
4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_410", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "convert_element_type_350", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_350", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "alias_default_412", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_412", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.14.ffn_norm", + "name": "pow_30", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_30", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "mean_29", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_29", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "add_72", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_72", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.14.ffn_norm", + "name": "rsqrt_29", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_29", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "alias_default_413", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_412", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_413", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "mul_102", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", 
+ "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_132", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "alias_default_411", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_102", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_411", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "mul_103", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_103", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "convert_element_type_351", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_132", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w1", + "name": "dtype_cast_133", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_133", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w1", + "name": "permute_162", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_351", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "alias_default_414", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, 
+ 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_162", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w1", + "name": "alias_default_415", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_414", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_415", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w1", + "name": "einsum_default_102", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_102", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w1", + "name": "alias_default_416", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_416", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "convert_element_type_354", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_354", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "alias_default_417", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + 
"cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_417", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "neg_14", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "exp_14", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "add_73", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + 
}, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_417", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_73", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "div_14", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "convert_element_type_355", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_134", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w3", + "name": "dtype_cast_134", + "op": 
"autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_134", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w3", + "name": "permute_163", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_163", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w3", + "name": "alias_default_419", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_414", + "src_placement": "S(0)R", + 
"transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_419", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w3", + "name": "einsum_default_103", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_355", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "alias_default_418", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_103", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w3", + "name": "alias_default_420", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + 
}, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_418", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_420", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "mul_104", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_133", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w2", + "name": "dtype_cast_135", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_135", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w2", + "name": "permute_164", + "op": "aten.permute.default", + "phase": "forward", 
+ "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_104", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "alias_default_421", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_164", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w2", + "name": "alias_default_422", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_421", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RS(0)", + "name": "alias_default_422", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w2", + "name": "einsum_default_104", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_410", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_104", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14", + "name": "add_74", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_144", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "dtype_cast_136", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_74", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14", + "name": "alias_default_423", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_423", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "convert_element_type_360", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_360", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "alias_default_425", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", 
+ "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_425", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "pow_31", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "mean_30", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_30", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "add_75", + "op": "aten.add.Scalar", + "phase": "forward", + 
"placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_75", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "rsqrt_30", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_30", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "alias_default_426", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_425", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_426", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "mul_105", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_136", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "alias_default_424", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_105", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_424", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "mul_106", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + 
"cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_106", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "convert_element_type_361", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_137", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wq", + "name": "dtype_cast_137", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_137", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention.wq", + "name": "permute_165", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_361", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "alias_default_427", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_165", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wq", + "name": "alias_default_428", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_427", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_428", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.15.attention.wq", + "name": "einsum_default_105", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_138", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wk", + "name": "dtype_cast_138", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_138", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention.wk", + "name": "permute_166", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": 
"permute_166", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wk", + "name": "alias_default_429", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_427", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_429", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention.wk", + "name": "einsum_default_106", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_139", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wv", + "name": "dtype_cast_139", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": 
"getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_139", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention.wv", + "name": "permute_167", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_167", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wv", + "name": "alias_default_430", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_427", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_430", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wv", + "name": "einsum_default_107", + "op": "aten.einsum.default", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_105", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_381", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_106", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_382", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_107", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_383", + 
"op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_381", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "convert_element_type_368", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_368", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_384", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "view_384", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_as_complex_30", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_382", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "convert_element_type_369", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_369", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_385", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_385", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_as_complex_31", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_386", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_386", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "alias_default_431", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 
+ ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_30", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_431", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "mul_107", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_107", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_as_real_30", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_30", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_387", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_431", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "mul_108", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_108", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_as_real_31", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + 
"func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_388", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_387", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "convert_element_type_370", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_388", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "convert_element_type_371", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + 
"shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_371", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "unsqueeze_30", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_30", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "expand_30", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_30", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "clone_30", 
+ "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_30", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_389", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_383", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "unsqueeze_31", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.15.attention", + "name": "expand_31", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "clone_31", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_390", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"convert_element_type_370", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "permute_168", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_389", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "permute_169", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_390", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "permute_170", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": 
"alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_168", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "alias_default_432", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_169", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "alias_default_433", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_170", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "alias_default_434", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_432", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_433", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_434", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_15", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.sdpa", + "name": "getitem_135", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + 
"transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.sdpa", + "name": "getitem_136", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_15", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.sdpa", + "name": "getitem_141", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_15", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.sdpa", + "name": "getitem_142", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_135", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.sdpa", + "name": "alias_default_435", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_435", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "permute_171", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_171", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_391", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + 
"transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_140", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wo", + "name": "dtype_cast_140", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_140", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention.wo", + "name": "permute_172", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_391", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "alias_default_436", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_172", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wo", + "name": "alias_default_437", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_436", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_437", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wo", + "name": "einsum_default_108", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_423", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_108", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15", + "name": "add_76", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_145", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "dtype_cast_141", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_76", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15", + "name": "alias_default_438", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_438", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "convert_element_type_374", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_374", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "alias_default_440", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_440", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "pow_32", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_32", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "mean_31", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "add_77", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_77", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "rsqrt_31", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + 
"cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "alias_default_441", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_440", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_441", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "mul_109", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_141", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "alias_default_439", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_109", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_439", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "mul_110", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_110", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "convert_element_type_375", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_141", + "src_placement": "S(0)S(0)", + "transition_cost": 0 
+ } + ], + "module_path": "L['self'].layers.15.feed_forward.w1", + "name": "dtype_cast_142", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_142", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w1", + "name": "permute_173", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_375", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "alias_default_442", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "RS(1)", + "name": "permute_173", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w1", + "name": "alias_default_443", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_442", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_443", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w1", + "name": "einsum_default_109", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_109", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w1", + "name": "alias_default_444", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_444", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "convert_element_type_378", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_378", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "alias_default_445", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_445", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "neg_15", + "op": "aten.neg.default", + "phase": 
"forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_15", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "exp_15", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_15", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "add_78", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_445", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + 
"name": "add_78", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "div_15", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_15", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "convert_element_type_379", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_143", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w3", + "name": "dtype_cast_143", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + 
"cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_143", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w3", + "name": "permute_174", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_174", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w3", + "name": "alias_default_447", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_442", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_447", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w3", + "name": "einsum_default_110", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": 
"return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_379", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "alias_default_446", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_110", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w3", + "name": "alias_default_448", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_446", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_448", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "mul_111", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_142", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w2", + "name": "dtype_cast_144", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_144", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w2", + "name": "permute_175", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_111", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "alias_default_449", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_175", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w2", + "name": "alias_default_450", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_449", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_450", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w2", + "name": "einsum_default_111", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_438", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_111", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15", + "name": "add_79", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_153", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "dtype_cast_145", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_79", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15", + 
"name": "alias_default_451", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_451", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "convert_element_type_384", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_384", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "alias_default_453", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "alias_default_453", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "pow_33", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_33", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "mean_32", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_32", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "add_80", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "add_80", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "rsqrt_32", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_32", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "alias_default_454", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_453", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_454", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "mul_112", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + 
}, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_145", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "alias_default_452", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_112", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_452", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "mul_113", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_113", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "convert_element_type_385", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 
8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_146", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wq", + "name": "dtype_cast_146", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_146", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention.wq", + "name": "permute_176", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_385", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": 
"alias_default_455", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_176", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wq", + "name": "alias_default_456", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_455", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_456", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wq", + "name": "einsum_default_112", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_147", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wk", + "name": "dtype_cast_147", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_147", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention.wk", + "name": "permute_177", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_177", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wk", + "name": "alias_default_457", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + 
"cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_455", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_457", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention.wk", + "name": "einsum_default_113", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_148", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wv", + "name": "dtype_cast_148", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_148", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention.wv", + "name": "permute_178", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + 
"source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_178", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wv", + "name": "alias_default_458", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_455", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_458", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wv", + "name": "einsum_default_114", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_112", + "src_placement": "S(0)S(2)", 
+ "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_406", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_113", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_407", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_114", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_408", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "view_406", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "convert_element_type_392", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_392", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_409", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_409", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_as_complex_32", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + 
"func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_407", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "convert_element_type_393", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_393", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_410", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_410", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_as_complex_33", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_411", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_411", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "alias_default_459", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_32", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": 
"alias_default_459", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "mul_114", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_114", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_as_real_32", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_32", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_412", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + 
"compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_33", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_459", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "mul_115", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_115", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_as_real_33", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_33", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_413", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = 
torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_412", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "convert_element_type_394", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_413", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "convert_element_type_395", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_395", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.16.attention", + "name": "unsqueeze_32", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_32", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "expand_32", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_32", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "clone_32", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_32", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_414", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_408", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "unsqueeze_33", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_33", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "expand_33", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_33", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "clone_33", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_33", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_415", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_394", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "permute_179", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + 
"cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_414", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "permute_180", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_415", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "permute_181", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_179", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "alias_default_460", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_180", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "alias_default_461", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_181", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "alias_default_462", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_460", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_461", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_462", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_16", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.sdpa", + "name": "getitem_144", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.sdpa", + "name": "getitem_145", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_16", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.sdpa", + "name": "getitem_150", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_16", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.sdpa", + "name": "getitem_151", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_144", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.sdpa", + "name": "alias_default_463", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_463", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "permute_182", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_182", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_416", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_149", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wo", + "name": "dtype_cast_149", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": 
"return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_149", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention.wo", + "name": "permute_183", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_416", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "alias_default_464", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_183", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wo", + "name": "alias_default_465", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + 
], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_464", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_465", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wo", + "name": "einsum_default_115", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_451", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_115", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16", + "name": "add_81", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + 
{ + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_154", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "dtype_cast_150", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_81", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16", + "name": "alias_default_466", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_466", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "convert_element_type_398", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + 
{ + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_398", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "alias_default_468", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_468", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "pow_34", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_34", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "mean_33", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + 
"line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_33", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "add_82", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_82", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "rsqrt_33", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_33", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "alias_default_469", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + 
}, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_468", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_469", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "mul_116", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_150", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "alias_default_467", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_116", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_467", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + 
"name": "mul_117", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_117", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "convert_element_type_399", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_150", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w1", + "name": "dtype_cast_151", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": 
"dtype_cast_151", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w1", + "name": "permute_184", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_399", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "alias_default_470", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_184", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w1", + "name": "alias_default_471", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, 
+ "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_470", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_471", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w1", + "name": "einsum_default_116", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_116", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w1", + "name": "alias_default_472", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_472", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "convert_element_type_402", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { 
+ "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_402", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "alias_default_473", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_473", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "neg_16", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "exp_16", + "op": "aten.exp.default", + "phase": 
"forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "add_83", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_473", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_83", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "div_16", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "div_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "convert_element_type_403", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_152", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w3", + "name": "dtype_cast_152", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_152", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w3", + "name": "permute_185", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + 
"cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_185", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w3", + "name": "alias_default_475", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_470", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_475", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w3", + "name": "einsum_default_117", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_403", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "alias_default_474", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, 
+ 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_117", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w3", + "name": "alias_default_476", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_474", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_476", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "mul_118", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "primals_151", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w2", + "name": "dtype_cast_153", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_153", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w2", + "name": "permute_186", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_118", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "alias_default_477", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_186", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w2", + "name": "alias_default_478", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_477", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_478", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w2", + "name": "einsum_default_118", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_466", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_118", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16", + "name": "add_84", + "op": 
"aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_162", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "dtype_cast_154", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_84", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16", + "name": "alias_default_479", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_479", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + 
], + "module_path": "L['self'].layers.17.attention_norm", + "name": "convert_element_type_408", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_408", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "alias_default_481", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_481", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "pow_35", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "pow_35", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "mean_34", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_34", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "add_85", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_85", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "rsqrt_34", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "rsqrt_34", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "alias_default_482", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_481", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_482", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "mul_119", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_154", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "alias_default_480", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + 
"line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_119", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_480", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "mul_120", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_120", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "convert_element_type_409", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_155", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wq", + "name": "dtype_cast_155", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + 
"placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_155", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention.wq", + "name": "permute_187", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_409", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "alias_default_483", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_187", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.17.attention.wq", + "name": "alias_default_484", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_483", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_484", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wq", + "name": "einsum_default_119", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_156", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wk", + "name": "dtype_cast_156", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + 
"cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_156", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention.wk", + "name": "permute_188", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_188", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wk", + "name": "alias_default_485", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_483", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_485", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention.wk", + "name": "einsum_default_120", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = 
self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_157", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wv", + "name": "dtype_cast_157", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_157", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention.wv", + "name": "permute_189", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_189", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wv", + "name": "alias_default_486", + "op": "aten.alias.default", + "phase": "forward", + 
"placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_483", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_486", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wv", + "name": "einsum_default_121", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_119", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_431", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + 
"name": "einsum_default_120", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_432", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_121", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_433", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_431", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "convert_element_type_416", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + 
"cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_416", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_434", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_434", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_as_complex_34", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_432", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "convert_element_type_417", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = 
torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_417", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_435", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_435", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_as_complex_35", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.17.attention", + "name": "view_436", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_436", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "alias_default_487", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_487", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "mul_121", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 
0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_121", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_as_real_34", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_437", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_35", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_487", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "mul_122", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * 
freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_122", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_as_real_35", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_35", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_438", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_437", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "convert_element_type_418", + "op": 
"prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_438", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "convert_element_type_419", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_419", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "unsqueeze_34", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"unsqueeze_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "expand_34", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "clone_34", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_439", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", 
+ "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_433", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "unsqueeze_35", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_35", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "expand_35", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_35", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "clone_35", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + 
"cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_35", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_440", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_418", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "permute_190", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_439", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "permute_191", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_440", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "permute_192", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_190", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "alias_default_488", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_191", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "alias_default_489", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + 
"shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_192", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "alias_default_490", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_488", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_489", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_490", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_17", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, 
is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_17", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.sdpa", + "name": "getitem_153", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_17", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.sdpa", + "name": "getitem_154", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_17", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.sdpa", + "name": "getitem_159", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, 
is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_17", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.sdpa", + "name": "getitem_160", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_153", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.sdpa", + "name": "alias_default_491", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_491", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "permute_193", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": 
"output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_193", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_441", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_158", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wo", + "name": "dtype_cast_158", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_158", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention.wo", + "name": "permute_194", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + 
"shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_441", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "alias_default_492", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_194", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wo", + "name": "alias_default_493", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_492", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_493", + "src_placement": "RR", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.17.attention.wo", + "name": "einsum_default_122", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_479", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_122", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17", + "name": "add_86", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_163", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "dtype_cast_159", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": 
"alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_86", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17", + "name": "alias_default_494", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_494", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "convert_element_type_422", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_422", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "alias_default_496", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_496", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "pow_36", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_36", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "mean_35", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_35", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "add_87", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + 
"file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_87", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "rsqrt_35", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_35", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "alias_default_497", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_496", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_497", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "mul_123", + "op": "aten.mul.Tensor", + "phase": 
"forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_159", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "alias_default_495", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_123", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_495", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "mul_124", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "mul_124", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "convert_element_type_423", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_159", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w1", + "name": "dtype_cast_160", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_160", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w1", + "name": "permute_195", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_423", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "alias_default_498", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_195", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w1", + "name": "alias_default_499", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_498", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_499", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w1", + "name": "einsum_default_123", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_123", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w1", + "name": "alias_default_500", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_500", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "convert_element_type_426", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_426", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + 
"name": "alias_default_501", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_501", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "neg_17", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_17", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "exp_17", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_17", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "add_88", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_501", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_88", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "div_17", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_17", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "convert_element_type_427", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_161", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w3", + "name": "dtype_cast_161", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_161", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w3", + "name": "permute_196", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_196", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w3", + "name": "alias_default_503", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_498", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_503", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w3", + "name": "einsum_default_124", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_427", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "alias_default_502", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_124", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.17.feed_forward.w3", + "name": "alias_default_504", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_502", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_504", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "mul_125", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_160", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w2", + "name": "dtype_cast_162", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + 
}, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_162", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w2", + "name": "permute_197", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_125", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "alias_default_505", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_197", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w2", + "name": "alias_default_506", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_505", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_506", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w2", + "name": "einsum_default_125", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_494", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_125", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17", + "name": "add_89", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(0)", + "name": "primals_171", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "dtype_cast_163", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_89", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17", + "name": "alias_default_507", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_507", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "convert_element_type_432", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + 
"cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_432", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "alias_default_509", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_509", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "pow_37", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_37", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "mean_36", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + 
}, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_36", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "add_90", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_90", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "rsqrt_36", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_36", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "alias_default_510", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_509", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_510", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "mul_126", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_163", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "alias_default_508", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_126", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_508", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.18.attention_norm", + "name": "mul_127", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_127", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "convert_element_type_433", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_164", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wq", + "name": "dtype_cast_164", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": 
"RS(0)", + "name": "dtype_cast_164", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention.wq", + "name": "permute_198", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_433", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "alias_default_511", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_198", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wq", + "name": "alias_default_512", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 
198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_511", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_512", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wq", + "name": "einsum_default_126", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_165", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wk", + "name": "dtype_cast_165", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_165", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention.wk", + "name": "permute_199", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_199", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wk", + "name": "alias_default_513", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_511", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_513", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention.wk", + "name": "einsum_default_127", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_166", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.18.attention.wv", + "name": "dtype_cast_166", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_166", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention.wv", + "name": "permute_200", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_200", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wv", + "name": "alias_default_514", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + 
"name": "alias_default_511", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_514", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wv", + "name": "einsum_default_128", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_126", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_456", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_127", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_457", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + 
}, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_128", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_458", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_456", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "convert_element_type_440", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_440", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_459", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = 
torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_459", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_as_complex_36", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_457", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "convert_element_type_441", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_441", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_460", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_460", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_as_complex_37", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_461", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": 
"complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_461", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "alias_default_515", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_36", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_515", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "mul_128", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_128", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_as_real_36", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_36", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_462", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_515", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "mul_129", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_129", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.18.attention", + "name": "view_as_real_37", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_463", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_462", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "convert_element_type_442", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 
9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_463", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "convert_element_type_443", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_443", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "unsqueeze_36", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_36", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "expand_36", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + 
"line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_36", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "clone_36", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_36", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_464", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_458", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "unsqueeze_37", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "expand_37", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "clone_37", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_465", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + 
"source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_442", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "permute_201", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_464", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "permute_202", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_465", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "permute_203", + "op": 
"aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_201", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "alias_default_516", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_202", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "alias_default_517", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"permute_203", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "alias_default_518", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_516", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_517", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_518", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_18", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.18.attention.sdpa", + "name": "getitem_162", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.sdpa", + "name": "getitem_163", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_18", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.sdpa", + "name": "getitem_168", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_18", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.sdpa", + "name": "getitem_169", + 
"op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_162", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.sdpa", + "name": "alias_default_519", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_519", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "permute_204", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_204", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.18.attention", + "name": "view_466", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_167", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wo", + "name": "dtype_cast_167", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_167", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention.wo", + "name": "permute_205", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_466", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "alias_default_520", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_205", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wo", + "name": "alias_default_521", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_520", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_521", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wo", + "name": "einsum_default_129", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 
39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_507", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_129", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18", + "name": "add_91", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_172", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "dtype_cast_168", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_91", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18", + "name": "alias_default_522", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_522", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "convert_element_type_446", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_446", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "alias_default_524", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_524", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "pow_38", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_38", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "mean_37", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_37", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "add_92", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_92", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "rsqrt_37", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": 
[ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_37", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "alias_default_525", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_524", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_525", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "mul_130", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_168", + "src_placement": "S(0)S(0)", + 
"transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "alias_default_523", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_130", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_523", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "mul_131", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_131", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "convert_element_type_447", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + 
"cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_168", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w1", + "name": "dtype_cast_169", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_169", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w1", + "name": "permute_206", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_447", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "alias_default_526", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_206", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w1", + "name": "alias_default_527", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_526", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_527", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w1", + "name": "einsum_default_130", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_130", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w1", 
+ "name": "alias_default_528", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_528", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "convert_element_type_450", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_450", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "alias_default_529", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "alias_default_529", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "neg_18", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "exp_18", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "add_93", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + 
"dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_529", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_93", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "div_18", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "convert_element_type_451", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_170", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w3", + "name": "dtype_cast_170", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_170", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w3", + "name": "permute_207", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_207", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w3", + "name": "alias_default_531", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_526", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_531", + "src_placement": "RS(1)", + "transition_cost": 0 + } + 
], + "module_path": "L['self'].layers.18.feed_forward.w3", + "name": "einsum_default_131", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_451", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "alias_default_530", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_131", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w3", + "name": "alias_default_532", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_530", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_532", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "mul_132", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_169", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w2", + "name": "dtype_cast_171", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_171", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w2", + "name": "permute_208", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_132", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "alias_default_533", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_208", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w2", + "name": "alias_default_534", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_533", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_534", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.18.feed_forward.w2", + "name": "einsum_default_132", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_522", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_132", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18", + "name": "add_94", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_180", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "dtype_cast_172", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + 
"cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_94", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18", + "name": "alias_default_535", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_535", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "convert_element_type_456", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_456", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "alias_default_537", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_537", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "pow_39", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_39", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "mean_38", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_38", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "add_95", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_95", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "rsqrt_38", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_38", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "alias_default_538", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_537", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_538", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "mul_133", + "op": 
"aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_172", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "alias_default_536", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_133", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_536", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "mul_134", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_134", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "convert_element_type_457", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_173", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wq", + "name": "dtype_cast_173", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_173", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention.wq", + "name": "permute_209", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + 
"cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_457", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "alias_default_539", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_209", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wq", + "name": "alias_default_540", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_539", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_540", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wq", + "name": "einsum_default_133", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": 
{ + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_174", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wk", + "name": "dtype_cast_174", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_174", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention.wk", + "name": "permute_210", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_210", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wk", + "name": "alias_default_541", + "op": "aten.alias.default", + "phase": 
"forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_539", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_541", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention.wk", + "name": "einsum_default_134", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_175", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wv", + "name": "dtype_cast_175", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_175", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention.wv", + "name": "permute_211", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_211", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wv", + "name": "alias_default_542", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_539", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_542", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wv", + "name": "einsum_default_135", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_133", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_481", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_134", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_482", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_135", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_483", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = 
xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_481", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "convert_element_type_464", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_464", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_484", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_484", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.19.attention", + "name": "view_as_complex_38", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_482", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "convert_element_type_465", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_465", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_485", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + 
"cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_485", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_as_complex_39", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_486", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_486", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "alias_default_543", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_38", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_543", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "mul_135", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_135", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_as_real_38", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_38", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_487", + "op": "aten.view.default", 
+ "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_39", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_543", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "mul_136", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_136", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_as_real_39", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": 
"float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_39", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_488", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_487", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "convert_element_type_466", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_488", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "convert_element_type_467", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_467", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "unsqueeze_38", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_38", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "expand_38", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_38", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "clone_38", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + 
], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_38", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_489", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_483", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "unsqueeze_39", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_39", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "expand_39", + "op": "aten.expand.default", + "phase": "forward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_39", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "clone_39", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_39", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_490", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_466", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", 
+ "name": "permute_212", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_489", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "permute_213", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_490", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "permute_214", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"permute_212", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "alias_default_544", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_213", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "alias_default_545", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_214", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "alias_default_546", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + 
"cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_544", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_545", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_546", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_19", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.sdpa", + "name": "getitem_171", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"_scaled_dot_product_flash_attention_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.sdpa", + "name": "getitem_172", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_19", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.sdpa", + "name": "getitem_177", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_19", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.sdpa", + "name": "getitem_178", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_171", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.sdpa", + "name": "alias_default_547", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_547", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "permute_215", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_215", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_491", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(0)", + "name": "primals_176", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wo", + "name": "dtype_cast_176", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_176", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention.wo", + "name": "permute_216", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_491", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "alias_default_548", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_216", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wo", + "name": "alias_default_549", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_548", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_549", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wo", + "name": "einsum_default_136", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_535", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_136", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19", + "name": "add_96", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + 
"source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_181", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "dtype_cast_177", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_96", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19", + "name": "alias_default_550", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_550", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": 
"convert_element_type_470", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_470", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "alias_default_552", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_552", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "pow_40", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_40", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "mean_39", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_39", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "add_97", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_97", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "rsqrt_39", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_39", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.19.ffn_norm", + "name": "alias_default_553", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_552", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_553", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "mul_137", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_177", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "alias_default_551", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + 
"compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_137", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_551", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "mul_138", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_138", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "convert_element_type_471", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_177", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w1", + "name": "dtype_cast_178", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_178", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w1", + "name": "permute_217", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_471", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "alias_default_554", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_217", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w1", + "name": "alias_default_555", + "op": "aten.alias.default", + "phase": "forward", + 
"placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_554", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_555", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w1", + "name": "einsum_default_137", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_137", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w1", + "name": "alias_default_556", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_556", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "convert_element_type_474", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_474", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "alias_default_557", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_557", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "neg_19", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "exp_19", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "add_98", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_557", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_98", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "div_19", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": 
{ + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "convert_element_type_475", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_179", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w3", + "name": "dtype_cast_179", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_179", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.19.feed_forward.w3", + "name": "permute_218", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_218", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w3", + "name": "alias_default_559", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_554", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_559", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w3", + "name": "einsum_default_138", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": 
"alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_475", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "alias_default_558", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_138", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w3", + "name": "alias_default_560", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_558", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_560", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "mul_139", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_178", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w2", + "name": "dtype_cast_180", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_180", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w2", + "name": "permute_219", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_139", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "alias_default_561", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_219", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w2", + "name": "alias_default_562", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_561", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_562", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w2", + "name": "einsum_default_139", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_550", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_139", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19", + "name": "add_99", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_189", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "dtype_cast_181", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_99", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19", + "name": "alias_default_563", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_563", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "convert_element_type_480", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_480", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "alias_default_565", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_565", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "pow_41", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + 
"placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_41", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "mean_40", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_40", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "add_100", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_100", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "rsqrt_40", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_40", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "alias_default_566", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_565", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_566", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "mul_140", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_181", + 
"src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "alias_default_564", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_140", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_564", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "mul_141", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_141", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "convert_element_type_481", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_182", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wq", + "name": "dtype_cast_182", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_182", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention.wq", + "name": "permute_220", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_481", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "alias_default_567", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
+ "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_220", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wq", + "name": "alias_default_568", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_567", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_568", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wq", + "name": "einsum_default_140", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_183", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wk", + "name": 
"dtype_cast_183", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_183", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention.wk", + "name": "permute_221", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_221", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wk", + "name": "alias_default_569", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_567", + "src_placement": "S(0)R", + 
"transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_569", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention.wk", + "name": "einsum_default_141", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_184", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wv", + "name": "dtype_cast_184", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_184", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention.wv", + "name": "permute_222", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + 
}, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_222", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wv", + "name": "alias_default_570", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_567", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_570", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wv", + "name": "einsum_default_142", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_140", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_506", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + 
"source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_141", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_507", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_142", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_508", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_506", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "convert_element_type_488", + "op": 
"prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_488", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_509", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_509", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_as_complex_40", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_507", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "convert_element_type_489", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_489", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_510", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_510", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_as_complex_41", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_511", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_511", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "alias_default_571", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_40", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_571", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "mul_142", + "op": "aten.mul.Tensor", 
+ "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_142", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_as_real_40", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_40", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_512", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_41", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_571", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "mul_143", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_143", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_as_real_41", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_41", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_513", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 
+ }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_512", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "convert_element_type_490", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_513", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "convert_element_type_491", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_491", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "unsqueeze_40", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, 
+ 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_40", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "expand_40", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_40", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "clone_40", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_40", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_514", + "op": "aten.view.default", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_508", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "unsqueeze_41", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_41", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "expand_41", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_41", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + 
"name": "clone_41", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_41", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_515", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_490", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "permute_223", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_514", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "permute_224", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_515", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "permute_225", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_223", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "alias_default_572", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_224", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "alias_default_573", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_225", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "alias_default_574", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_572", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_573", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_574", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.20.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_20", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.sdpa", + "name": "getitem_180", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.sdpa", + "name": "getitem_181", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_20", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.sdpa", + "name": "getitem_186", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_20", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.sdpa", + "name": "getitem_187", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_180", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.sdpa", + "name": "alias_default_575", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_575", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "permute_226", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_226", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_516", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_185", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wo", + "name": "dtype_cast_185", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_185", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention.wo", + "name": "permute_227", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_516", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "alias_default_576", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_227", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wo", + "name": "alias_default_577", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": 
"einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_576", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_577", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wo", + "name": "einsum_default_143", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_563", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_143", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20", + "name": "add_101", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_190", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "dtype_cast_186", + "op": 
"autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_101", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20", + "name": "alias_default_578", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_578", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "convert_element_type_494", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_494", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "alias_default_580", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_580", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "pow_42", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_42", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "mean_41", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "mean_41", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "add_102", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_102", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "rsqrt_41", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_41", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "alias_default_581", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", 
+ "name": "alias_default_580", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_581", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "mul_144", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_186", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "alias_default_579", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_144", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_579", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "mul_145", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_145", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "convert_element_type_495", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_186", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w1", + "name": "dtype_cast_187", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_187", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w1", + "name": "permute_228", + "op": "aten.permute.default", + "phase": "forward", + 
"placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_495", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "alias_default_582", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_228", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w1", + "name": "alias_default_583", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_582", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RS(1)", + "name": "alias_default_583", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w1", + "name": "einsum_default_144", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_144", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w1", + "name": "alias_default_584", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_584", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "convert_element_type_498", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_498", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "alias_default_585", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_585", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "neg_20", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_20", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "exp_20", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_20", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "add_103", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_585", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_103", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "div_20", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_20", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + 
"name": "convert_element_type_499", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_188", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w3", + "name": "dtype_cast_188", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_188", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w3", + "name": "permute_229", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": 
"permute_229", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w3", + "name": "alias_default_587", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_582", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_587", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w3", + "name": "einsum_default_145", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_499", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "alias_default_586", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_145", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w3", + "name": "alias_default_588", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_586", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_588", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "mul_146", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_187", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.20.feed_forward.w2", + "name": "dtype_cast_189", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_189", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w2", + "name": "permute_230", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_146", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "alias_default_589", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + 
"name": "permute_230", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w2", + "name": "alias_default_590", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_589", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_590", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w2", + "name": "einsum_default_146", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_578", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_146", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20", + "name": "add_104", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + 
"source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_198", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "dtype_cast_190", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_104", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20", + "name": "alias_default_591", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_591", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "convert_element_type_504", + "op": 
"prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_504", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "alias_default_593", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_593", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "pow_43", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_43", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.21.attention_norm", + "name": "mean_42", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_42", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "add_105", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_105", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "rsqrt_42", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_42", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.21.attention_norm", + "name": "alias_default_594", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_593", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_594", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "mul_147", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_190", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "alias_default_592", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 
52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_147", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_592", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "mul_148", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_148", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "convert_element_type_505", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_191", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wq", + "name": "dtype_cast_191", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_191", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention.wq", + "name": "permute_231", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_505", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "alias_default_595", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_231", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wq", + "name": "alias_default_596", + "op": "aten.alias.default", + "phase": 
"forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_595", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_596", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wq", + "name": "einsum_default_147", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_192", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wk", + "name": "dtype_cast_192", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_192", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention.wk", + "name": "permute_232", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_232", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wk", + "name": "alias_default_597", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_595", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_597", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention.wk", + "name": "einsum_default_148", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_193", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wv", + "name": "dtype_cast_193", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_193", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention.wv", + "name": "permute_233", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_233", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wv", + "name": "alias_default_598", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], 
+ "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_595", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_598", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wv", + "name": "einsum_default_149", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_147", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_531", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_148", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_532", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_149", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_533", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_531", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "convert_element_type_512", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + 
"dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_512", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_534", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_534", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_as_complex_42", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_532", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "convert_element_type_513", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_513", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_535", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_535", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_as_complex_43", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_536", + "op": "aten.view.default", + "phase": 
"forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_536", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "alias_default_599", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_42", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_599", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "mul_149", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + 
"name": "mul_149", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_as_real_42", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_42", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_537", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_599", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "mul_150", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_150", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_as_real_43", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_538", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_537", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "convert_element_type_514", + "op": "prims.convert_element_type.default", + 
"phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_538", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "convert_element_type_515", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_515", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "unsqueeze_42", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_42", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "expand_42", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_42", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "clone_42", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_42", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_539", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "view_533", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "unsqueeze_43", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "expand_43", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "clone_43", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_540", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_514", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "permute_234", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_539", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "permute_235", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 
+ }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_540", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "permute_236", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_234", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "alias_default_600", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_235", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "alias_default_601", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, 
seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_236", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "alias_default_602", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_600", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_601", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_602", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_21", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", 
+ "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.sdpa", + "name": "getitem_189", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.sdpa", + "name": "getitem_190", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_21", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.sdpa", + "name": "getitem_195", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + 
"line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_21", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.sdpa", + "name": "getitem_196", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_189", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.sdpa", + "name": "alias_default_603", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_603", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "permute_237", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_237", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_541", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_194", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wo", + "name": "dtype_cast_194", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_194", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention.wo", + "name": "permute_238", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_541", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "alias_default_604", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_238", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wo", + "name": "alias_default_605", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_604", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_605", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wo", + "name": "einsum_default_150", + "op": 
"aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_591", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_150", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21", + "name": "add_106", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_199", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "dtype_cast_195", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "add_106", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21", + "name": "alias_default_606", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_606", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "convert_element_type_518", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_518", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "alias_default_608", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + 
"cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_608", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "pow_44", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_44", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "mean_43", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_43", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "add_107", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, 
+ "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_107", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "rsqrt_43", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_43", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "alias_default_609", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_608", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_609", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "mul_151", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
+ "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_195", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "alias_default_607", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_151", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_607", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "mul_152", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_152", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": 
"convert_element_type_519", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_195", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w1", + "name": "dtype_cast_196", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_196", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w1", + "name": "permute_239", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": 
"convert_element_type_519", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "alias_default_610", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_239", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w1", + "name": "alias_default_611", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_610", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_611", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w1", + "name": "einsum_default_151", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_151", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w1", + "name": "alias_default_612", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_612", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "convert_element_type_522", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_522", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "alias_default_613", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_613", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "neg_21", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_21", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "exp_21", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_21", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "add_108", + "op": "aten.add.Tensor", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_613", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_108", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "div_21", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_21", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "convert_element_type_523", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + 
{ + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_197", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w3", + "name": "dtype_cast_197", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_197", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w3", + "name": "permute_240", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_240", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w3", + "name": "alias_default_615", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + 
"cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_610", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_615", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w3", + "name": "einsum_default_152", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_523", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "alias_default_614", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_152", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w3", + "name": "alias_default_616", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 
8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_614", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_616", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "mul_153", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_196", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w2", + "name": "dtype_cast_198", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + 
"name": "dtype_cast_198", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w2", + "name": "permute_241", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_153", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "alias_default_617", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_241", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w2", + "name": "alias_default_618", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 
694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_617", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_618", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w2", + "name": "einsum_default_153", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_606", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_153", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21", + "name": "add_109", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_207", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "dtype_cast_199", + "op": 
"autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_109", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21", + "name": "alias_default_619", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_619", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "convert_element_type_528", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_528", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "alias_default_621", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_621", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "pow_45", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_45", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "mean_44", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"mean_44", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "add_110", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_110", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "rsqrt_44", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_44", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "alias_default_622", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "alias_default_621", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_622", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "mul_154", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_199", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "alias_default_620", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_154", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_620", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "mul_155", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_155", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "convert_element_type_529", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_200", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wq", + "name": "dtype_cast_200", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_200", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention.wq", + "name": "permute_242", + "op": 
"aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_529", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "alias_default_623", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_242", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wq", + "name": "alias_default_624", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_623", + "src_placement": "S(0)R", + "transition_cost": 0 
+ }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_624", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wq", + "name": "einsum_default_154", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_201", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wk", + "name": "dtype_cast_201", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_201", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention.wk", + "name": "permute_243", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 
19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_243", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wk", + "name": "alias_default_625", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_623", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_625", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention.wk", + "name": "einsum_default_155", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_202", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wv", + "name": "dtype_cast_202", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": 
"return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_202", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention.wv", + "name": "permute_244", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_244", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wv", + "name": "alias_default_626", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_623", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_626", + "src_placement": "RS(1)", + "transition_cost": 
0 + } + ], + "module_path": "L['self'].layers.22.attention.wv", + "name": "einsum_default_156", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_154", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_556", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_155", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_557", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"einsum_default_156", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_558", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_556", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "convert_element_type_536", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_536", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_559", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 
0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_559", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_as_complex_44", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_557", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "convert_element_type_537", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_537", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_560", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + 
"code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_560", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_as_complex_45", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_561", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_561", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "alias_default_627", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_44", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_627", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "mul_156", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_156", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_as_real_44", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", 
+ "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_44", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_562", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_45", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_627", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "mul_157", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_157", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_as_real_45", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * 
freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_45", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_563", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_562", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "convert_element_type_538", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_563", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": 
"convert_element_type_539", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_539", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "unsqueeze_44", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_44", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "expand_44", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_44", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "clone_44", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_44", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_564", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_558", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "unsqueeze_45", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "unsqueeze_45", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "expand_45", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_45", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "clone_45", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_45", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_565", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_538", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "permute_245", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_564", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "permute_246", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_565", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "permute_247", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_245", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "alias_default_628", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_246", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "alias_default_629", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_247", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "alias_default_630", + "op": "aten.alias.default", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_628", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_629", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_630", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_22", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_22", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.sdpa", + "name": "getitem_198", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, 
is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_22", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.sdpa", + "name": "getitem_199", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_22", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.sdpa", + "name": "getitem_204", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_22", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.sdpa", + "name": "getitem_205", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_198", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.sdpa", + "name": "alias_default_631", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_631", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "permute_248", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_248", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_566", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + 
], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_203", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wo", + "name": "dtype_cast_203", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_203", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention.wo", + "name": "permute_249", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_566", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "alias_default_632", + "op": "aten.alias.default", + "phase": "forward", + 
"placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_249", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wo", + "name": "alias_default_633", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_632", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_633", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wo", + "name": "einsum_default_157", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_619", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_157", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22", + "name": "add_111", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_208", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "dtype_cast_204", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_111", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22", + "name": "alias_default_634", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_634", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "convert_element_type_542", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_542", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "alias_default_636", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_636", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "pow_46", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_46", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "mean_45", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_45", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "add_112", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_112", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "rsqrt_45", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_45", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "alias_default_637", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_636", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_637", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "mul_158", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_204", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "alias_default_635", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_158", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_635", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "mul_159", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_159", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "convert_element_type_543", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_204", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w1", + "name": "dtype_cast_205", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_205", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w1", + "name": "permute_250", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_543", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "alias_default_638", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 
85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_250", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w1", + "name": "alias_default_639", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_638", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_639", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w1", + "name": "einsum_default_158", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_158", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w1", + "name": "alias_default_640", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 
+ ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_640", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "convert_element_type_546", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_546", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "alias_default_641", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_641", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.22.feed_forward", + "name": "neg_22", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "exp_22", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "add_113", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_641", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_113", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "div_22", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "convert_element_type_547", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_206", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w3", + "name": "dtype_cast_206", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + 
"func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_206", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w3", + "name": "permute_251", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_251", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w3", + "name": "alias_default_643", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_638", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_643", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w3", + "name": "einsum_default_159", + "op": "aten.einsum.default", + "phase": 
"forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_547", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "alias_default_642", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_159", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w3", + "name": "alias_default_644", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_642", + "src_placement": "S(0)S(2)", + "transition_cost": 
0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_644", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "mul_160", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_205", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w2", + "name": "dtype_cast_207", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_207", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w2", + "name": "permute_252", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + 
"cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_160", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "alias_default_645", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_252", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w2", + "name": "alias_default_646", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_645", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_646", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w2", + "name": "einsum_default_160", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 
8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_634", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_160", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22", + "name": "add_114", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_216", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "dtype_cast_208", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_114", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22", + "name": "alias_default_647", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_647", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "convert_element_type_552", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_552", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "alias_default_649", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 
52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_649", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "pow_47", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_47", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "mean_46", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_46", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "add_115", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_115", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "rsqrt_46", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_46", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "alias_default_650", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_649", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_650", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "mul_161", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_208", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "alias_default_648", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_161", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_648", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "mul_162", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_162", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": 
"convert_element_type_553", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_209", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wq", + "name": "dtype_cast_209", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_209", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention.wq", + "name": "permute_253", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": 
"convert_element_type_553", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "alias_default_651", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_253", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wq", + "name": "alias_default_652", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_651", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_652", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wq", + "name": "einsum_default_161", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_210", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wk", + "name": "dtype_cast_210", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_210", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention.wk", + "name": "permute_254", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_254", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wk", + "name": "alias_default_653", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_651", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_653", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention.wk", + "name": "einsum_default_162", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_211", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wv", + "name": "dtype_cast_211", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_211", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.23.attention.wv", + "name": "permute_255", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_255", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wv", + "name": "alias_default_654", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_651", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_654", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wv", + "name": "einsum_default_163", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_161", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_581", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_162", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_582", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_163", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_583", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { 
+ "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_581", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "convert_element_type_560", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_560", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_584", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_584", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_as_complex_46", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": 
"xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_582", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "convert_element_type_561", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_561", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_585", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_585", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } 
+ ], + "module_path": "L['self'].layers.23.attention", + "name": "view_as_complex_47", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_586", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_586", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "alias_default_655", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", 
+ "name": "view_as_complex_46", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_655", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "mul_163", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_163", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_as_real_46", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_46", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_587", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_47", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_655", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "mul_164", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_164", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_as_real_47", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_47", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.23.attention", + "name": "view_588", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_587", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "convert_element_type_562", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_588", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "convert_element_type_563", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_563", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "unsqueeze_46", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_46", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "expand_46", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_46", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "clone_46", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 
0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_46", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_589", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_583", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "unsqueeze_47", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_47", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "expand_47", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_47", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "clone_47", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_47", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_590", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_562", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "permute_256", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, 
head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_589", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "permute_257", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_590", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "permute_258", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_256", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "alias_default_656", + "op": "aten.alias.default", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_257", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "alias_default_657", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_258", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "alias_default_658", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_656", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_657", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_658", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_23", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_23", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.sdpa", + "name": "getitem_207", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_23", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.sdpa", + "name": "getitem_208", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + 
"shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_23", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.sdpa", + "name": "getitem_213", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_23", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.sdpa", + "name": "getitem_214", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_207", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.sdpa", + "name": "alias_default_659", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + 
"source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_659", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "permute_259", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_259", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_591", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_212", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wo", + "name": "dtype_cast_212", + "op": "autoparallel.dtype_cast.default", + "phase": 
"forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_212", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention.wo", + "name": "permute_260", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_591", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "alias_default_660", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_260", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wo", + "name": "alias_default_661", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_660", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_661", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wo", + "name": "einsum_default_164", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_647", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_164", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23", + "name": "add_116", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, 
+ "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_217", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "dtype_cast_213", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_116", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23", + "name": "alias_default_662", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_662", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "convert_element_type_566", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_566", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "alias_default_664", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_664", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "pow_48", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_48", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "mean_47", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_47", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "add_117", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_117", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "rsqrt_47", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_47", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "alias_default_665", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_664", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_665", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "mul_165", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_213", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "alias_default_663", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_165", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "alias_default_663", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "mul_166", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_166", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "convert_element_type_567", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_213", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w1", + "name": "dtype_cast_214", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + 
"cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_214", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w1", + "name": "permute_261", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_567", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "alias_default_666", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_261", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w1", + "name": "alias_default_667", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_666", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_667", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w1", + "name": "einsum_default_165", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_165", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w1", + "name": "alias_default_668", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_668", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": 
"convert_element_type_570", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_570", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "alias_default_669", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_669", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "neg_23", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_23", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "exp_23", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "add_118", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_669", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_118", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "div_23", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { 
+ "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "convert_element_type_571", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_215", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w3", + "name": "dtype_cast_215", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_215", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w3", + "name": "permute_262", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_262", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w3", + "name": "alias_default_671", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_666", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_671", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w3", + "name": "einsum_default_166", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_571", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.23.feed_forward", + "name": "alias_default_670", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_166", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w3", + "name": "alias_default_672", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_670", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_672", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "mul_167", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + 
"cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_214", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w2", + "name": "dtype_cast_216", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_216", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w2", + "name": "permute_263", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_167", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "alias_default_673", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_263", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w2", + "name": "alias_default_674", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_673", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_674", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w2", + "name": "einsum_default_167", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_662", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 
430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_167", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23", + "name": "add_119", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_225", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "dtype_cast_217", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_119", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23", + "name": "alias_default_675", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + 
"compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_675", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "convert_element_type_576", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_576", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "alias_default_677", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_677", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "pow_49", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": 
"rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_49", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "mean_48", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_48", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "add_120", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_120", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "rsqrt_48", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + 
"line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_48", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "alias_default_678", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_677", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_678", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "mul_168", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_217", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "alias_default_676", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + 
"shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_168", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_676", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "mul_169", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_169", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "convert_element_type_577", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_218", + 
"src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wq", + "name": "dtype_cast_218", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_218", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention.wq", + "name": "permute_264", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_577", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "alias_default_679", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_264", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wq", + "name": "alias_default_680", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_679", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_680", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wq", + "name": "einsum_default_168", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_219", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wk", + "name": "dtype_cast_219", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, 
_dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_219", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention.wk", + "name": "permute_265", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_265", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wk", + "name": "alias_default_681", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_679", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_681", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention.wk", + 
"name": "einsum_default_169", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_220", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wv", + "name": "dtype_cast_220", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_220", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention.wv", + "name": "permute_266", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_266", + "src_placement": 
"RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wv", + "name": "alias_default_682", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_679", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_682", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wv", + "name": "einsum_default_170", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_168", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_606", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 
+ }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_169", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_607", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_170", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_608", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_606", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "convert_element_type_584", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_584", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_609", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_609", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_as_complex_48", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_607", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "convert_element_type_585", + 
"op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_585", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_610", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_610", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_as_complex_49", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_611", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_611", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "alias_default_683", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_48", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_683", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "mul_170", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_170", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_as_real_48", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_48", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_612", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_49", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_683", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "mul_171", + "op": "aten.mul.Tensor", + 
"phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_171", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_as_real_49", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_49", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_613", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_612", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "convert_element_type_586", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_613", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "convert_element_type_587", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_587", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "unsqueeze_48", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", 
+ "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_48", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "expand_48", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_48", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "clone_48", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_48", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_614", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_608", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "unsqueeze_49", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_49", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "expand_49", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_49", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "clone_49", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_49", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_615", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_586", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "permute_267", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_614", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "permute_268", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 
+ ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_615", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "permute_269", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_267", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "alias_default_684", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_268", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + 
"name": "alias_default_685", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_269", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "alias_default_686", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_684", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_685", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_686", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_24", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, 
+ 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.sdpa", + "name": "getitem_216", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.sdpa", + "name": "getitem_217", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_24", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.sdpa", + "name": "getitem_222", + "op": "", + "phase": "forward", + "placement": "RR", + 
"shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_24", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.sdpa", + "name": "getitem_223", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_216", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.sdpa", + "name": "alias_default_687", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_687", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "permute_270", + "op": "aten.permute.default", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_270", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_616", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_221", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wo", + "name": "dtype_cast_221", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_221", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention.wo", + "name": 
"permute_271", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_616", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "alias_default_688", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_271", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wo", + "name": "alias_default_689", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_688", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_689", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wo", + "name": "einsum_default_171", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_675", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_171", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24", + "name": "add_121", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_226", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "dtype_cast_222", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, 
+ "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_121", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24", + "name": "alias_default_690", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_690", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "convert_element_type_590", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_590", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "alias_default_692", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_692", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "pow_50", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_50", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "mean_49", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_49", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "add_122", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_122", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "rsqrt_49", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_49", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "alias_default_693", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_692", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_693", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + 
"name": "mul_172", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_222", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "alias_default_691", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_172", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_691", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "mul_173", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_173", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "convert_element_type_591", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_222", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w1", + "name": "dtype_cast_223", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_223", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w1", + "name": "permute_272", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + 
"cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_591", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "alias_default_694", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_272", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w1", + "name": "alias_default_695", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_694", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_695", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w1", + "name": "einsum_default_172", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, 
+ 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_172", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w1", + "name": "alias_default_696", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_696", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "convert_element_type_594", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_594", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.24.feed_forward", + "name": "alias_default_697", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_697", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "neg_24", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "exp_24", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_24", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "add_123", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_697", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_123", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "div_24", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "convert_element_type_595", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_224", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w3", + "name": "dtype_cast_224", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_224", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w3", + "name": "permute_273", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_273", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w3", + "name": "alias_default_699", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * 
self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_694", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_699", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w3", + "name": "einsum_default_173", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_595", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "alias_default_698", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_173", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w3", + "name": "alias_default_700", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_698", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_700", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "mul_174", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_223", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "dtype_cast_225", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 
+ }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_225", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "permute_274", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_174", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "alias_default_701", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_274", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "alias_default_702", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_701", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_702", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "einsum_default_174", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_690", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_174", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24", + "name": "add_124", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, 
+ "dst_placement": "S(0)S(0)", + "name": "primals_234", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "dtype_cast_226", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_124", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24", + "name": "alias_default_703", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_703", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "convert_element_type_600", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + 
"cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_600", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "alias_default_705", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_705", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "pow_51", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_51", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "mean_50", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + 
}, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_50", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "add_125", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_125", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "rsqrt_50", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_50", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "alias_default_706", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, 
+ "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_705", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_706", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "mul_175", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_226", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "alias_default_704", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_175", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_704", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.25.attention_norm", + "name": "mul_176", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_176", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "convert_element_type_601", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_227", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wq", + "name": "dtype_cast_227", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": 
"RS(0)", + "name": "dtype_cast_227", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention.wq", + "name": "permute_275", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_601", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "alias_default_707", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_275", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wq", + "name": "alias_default_708", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 
198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_707", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_708", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wq", + "name": "einsum_default_175", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_228", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wk", + "name": "dtype_cast_228", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_228", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention.wk", + "name": "permute_276", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_276", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wk", + "name": "alias_default_709", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_707", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_709", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention.wk", + "name": "einsum_default_176", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_229", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.25.attention.wv", + "name": "dtype_cast_229", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_229", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention.wv", + "name": "permute_277", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_277", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wv", + "name": "alias_default_710", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + 
"name": "alias_default_707", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_710", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wv", + "name": "einsum_default_177", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_175", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_631", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_176", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_632", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + 
}, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_177", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_633", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_631", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "convert_element_type_608", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_608", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_634", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = 
torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_634", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_as_complex_50", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_632", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "convert_element_type_609", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_609", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_635", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_635", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_as_complex_51", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_636", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": 
"complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_636", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "alias_default_711", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_50", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_711", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "mul_177", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_177", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_as_real_50", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_50", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_637", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_51", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_711", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "mul_178", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_178", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.25.attention", + "name": "view_as_real_51", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_51", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_638", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_637", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "convert_element_type_610", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 
9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_638", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "convert_element_type_611", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_611", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "unsqueeze_50", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_50", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "expand_50", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + 
"line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_50", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "clone_50", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_50", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_639", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_633", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "unsqueeze_51", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_51", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "expand_51", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_51", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "clone_51", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_51", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_640", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + 
"source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_610", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "permute_278", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_639", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "permute_279", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_640", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "permute_280", + "op": 
"aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_278", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "alias_default_712", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_279", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "alias_default_713", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"permute_280", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "alias_default_714", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_712", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_713", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_714", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_25", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.25.attention.sdpa", + "name": "getitem_225", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.sdpa", + "name": "getitem_226", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_25", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.sdpa", + "name": "getitem_231", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_25", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.sdpa", + "name": "getitem_232", + 
"op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_225", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.sdpa", + "name": "alias_default_715", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_715", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "permute_281", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_281", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.25.attention", + "name": "view_641", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_230", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wo", + "name": "dtype_cast_230", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_230", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention.wo", + "name": "permute_282", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_641", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "alias_default_716", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_282", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wo", + "name": "alias_default_717", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_716", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_717", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wo", + "name": "einsum_default_178", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 
39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_703", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_178", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25", + "name": "add_126", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_235", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "dtype_cast_231", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_126", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25", + "name": "alias_default_718", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_718", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "convert_element_type_614", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_614", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "alias_default_720", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_720", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "pow_52", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_52", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "mean_51", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_51", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "add_127", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_127", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "rsqrt_51", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + 
"shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_51", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "alias_default_721", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_720", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_721", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "mul_179", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_231", + "src_placement": 
"S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "alias_default_719", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_179", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_719", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "mul_180", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_180", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "convert_element_type_615", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_231", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w1", + "name": "dtype_cast_232", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_232", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w1", + "name": "permute_283", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_615", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "alias_default_722", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_283", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w1", + "name": "alias_default_723", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_722", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_723", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w1", + "name": "einsum_default_179", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_179", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w1", 
+ "name": "alias_default_724", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_724", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "convert_element_type_618", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_618", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "alias_default_725", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "alias_default_725", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "neg_25", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "exp_25", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "add_128", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + 
"dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_725", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_128", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "div_25", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "convert_element_type_619", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_233", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w3", + "name": "dtype_cast_233", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_233", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w3", + "name": "permute_284", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_284", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w3", + "name": "alias_default_727", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_722", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_727", + "src_placement": "RS(1)", + "transition_cost": 0 + } + 
], + "module_path": "L['self'].layers.25.feed_forward.w3", + "name": "einsum_default_180", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_619", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "alias_default_726", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_180", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w3", + "name": "alias_default_728", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_726", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_728", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "mul_181", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_232", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w2", + "name": "dtype_cast_234", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_234", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w2", + "name": "permute_285", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_181", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "alias_default_729", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_285", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w2", + "name": "alias_default_730", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_729", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_730", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.25.feed_forward.w2", + "name": "einsum_default_181", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_718", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_181", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25", + "name": "add_129", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_243", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "dtype_cast_235", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + 
"cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_129", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25", + "name": "alias_default_731", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_731", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "convert_element_type_624", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_624", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "alias_default_733", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_733", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "pow_53", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_53", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "mean_52", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_52", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "add_130", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_130", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "rsqrt_52", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_52", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "alias_default_734", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_733", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_734", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "mul_182", + "op": 
"aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_235", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "alias_default_732", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_182", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_732", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "mul_183", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_183", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "convert_element_type_625", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_236", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wq", + "name": "dtype_cast_236", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_236", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention.wq", + "name": "permute_286", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + 
"cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_625", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "alias_default_735", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_286", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wq", + "name": "alias_default_736", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_735", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_736", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wq", + "name": "einsum_default_182", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": 
{ + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_237", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wk", + "name": "dtype_cast_237", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_237", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention.wk", + "name": "permute_287", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_287", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wk", + "name": "alias_default_737", + "op": "aten.alias.default", + "phase": 
"forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_735", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_737", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention.wk", + "name": "einsum_default_183", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_238", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wv", + "name": "dtype_cast_238", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_238", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention.wv", + "name": "permute_288", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_288", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wv", + "name": "alias_default_738", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_735", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_738", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wv", + "name": "einsum_default_184", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_182", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_656", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_183", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_657", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_184", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_658", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = 
xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_656", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "convert_element_type_632", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_632", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_659", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_659", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.26.attention", + "name": "view_as_complex_52", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_657", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "convert_element_type_633", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_633", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_660", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + 
"cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_660", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_as_complex_53", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_661", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_661", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "alias_default_739", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_739", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "mul_184", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_184", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_as_real_52", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_662", + "op": "aten.view.default", 
+ "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_739", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "mul_185", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_185", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_as_real_53", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": 
"float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_663", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_662", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "convert_element_type_634", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_663", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "convert_element_type_635", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_635", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "unsqueeze_52", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "expand_52", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "clone_52", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + 
], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_664", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_658", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "unsqueeze_53", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "expand_53", + "op": "aten.expand.default", + "phase": "forward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "clone_53", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_665", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_634", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", 
+ "name": "permute_289", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_664", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "permute_290", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_665", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "permute_291", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"permute_289", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "alias_default_740", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_290", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "alias_default_741", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_291", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "alias_default_742", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + 
"cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_740", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_741", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_742", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_26", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.sdpa", + "name": "getitem_234", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"_scaled_dot_product_flash_attention_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.sdpa", + "name": "getitem_235", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_26", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.sdpa", + "name": "getitem_240", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_26", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.sdpa", + "name": "getitem_241", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_234", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.sdpa", + "name": "alias_default_743", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_743", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "permute_292", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_292", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_666", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(0)", + "name": "primals_239", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wo", + "name": "dtype_cast_239", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_239", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention.wo", + "name": "permute_293", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_666", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "alias_default_744", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_293", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wo", + "name": "alias_default_745", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_744", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_745", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wo", + "name": "einsum_default_185", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_731", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_185", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26", + "name": "add_131", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + 
"source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_244", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "dtype_cast_240", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_131", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26", + "name": "alias_default_746", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_746", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": 
"convert_element_type_638", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_638", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "alias_default_748", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_748", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "pow_54", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_54", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "mean_53", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_53", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "add_132", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_132", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "rsqrt_53", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_53", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + 
], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "alias_default_749", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_748", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_749", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "mul_186", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_240", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "alias_default_747", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + 
"compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_186", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_747", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "mul_187", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_187", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "convert_element_type_639", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_240", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w1", + "name": "dtype_cast_241", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_241", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w1", + "name": "permute_294", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_639", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "alias_default_750", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_294", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w1", + "name": "alias_default_751", + "op": "aten.alias.default", + "phase": "forward", + 
"placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_750", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_751", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w1", + "name": "einsum_default_186", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_186", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w1", + "name": "alias_default_752", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_752", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "convert_element_type_642", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_642", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "alias_default_753", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_753", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "neg_26", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "exp_26", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "add_133", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_753", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_133", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "div_26", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "convert_element_type_643", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_242", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w3", + "name": "dtype_cast_242", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_242", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.26.feed_forward.w3", + "name": "permute_295", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_295", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w3", + "name": "alias_default_755", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_750", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_755", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w3", + "name": "einsum_default_187", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": 
"alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_643", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "alias_default_754", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_187", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w3", + "name": "alias_default_756", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_754", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_756", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "mul_188", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_241", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "dtype_cast_243", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_243", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "permute_296", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_188", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "alias_default_757", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_296", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "alias_default_758", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_757", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_758", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "einsum_default_188", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_746", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_188", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26", + "name": "add_134", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_252", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "dtype_cast_244", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_134", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26", + "name": "alias_default_759", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_759", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "convert_element_type_648", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_648", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "alias_default_761", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_761", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "pow_55", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + 
"placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_55", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "mean_54", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_54", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "add_135", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_135", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "rsqrt_54", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_54", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "alias_default_762", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_761", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_762", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "mul_189", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_244", + 
"src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "alias_default_760", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_189", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_760", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "mul_190", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_190", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "convert_element_type_649", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_245", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wq", + "name": "dtype_cast_245", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_245", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention.wq", + "name": "permute_297", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_649", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "alias_default_763", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
+ "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_297", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wq", + "name": "alias_default_764", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_763", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_764", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wq", + "name": "einsum_default_189", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_246", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wk", + "name": 
"dtype_cast_246", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_246", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention.wk", + "name": "permute_298", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_298", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wk", + "name": "alias_default_765", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_763", + "src_placement": "S(0)R", + 
"transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_765", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention.wk", + "name": "einsum_default_190", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_247", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wv", + "name": "dtype_cast_247", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_247", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention.wv", + "name": "permute_299", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + 
}, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_299", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wv", + "name": "alias_default_766", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_763", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_766", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wv", + "name": "einsum_default_191", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_189", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_681", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + 
"source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_190", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_682", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_191", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_683", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_681", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "convert_element_type_656", + "op": 
"prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_656", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_684", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_684", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_as_complex_54", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_682", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "convert_element_type_657", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_657", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_685", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_685", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_as_complex_55", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_686", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_686", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "alias_default_767", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_767", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "mul_191", + "op": "aten.mul.Tensor", 
+ "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_191", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_as_real_54", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_687", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_55", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_767", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "mul_192", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_192", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_as_real_55", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_55", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_688", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 
+ }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_687", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "convert_element_type_658", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_688", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "convert_element_type_659", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_659", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "unsqueeze_54", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, 
+ 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "expand_54", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "clone_54", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_689", + "op": "aten.view.default", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_683", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "unsqueeze_55", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_55", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "expand_55", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_55", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + 
"name": "clone_55", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_55", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_690", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_658", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "permute_300", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_689", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "permute_301", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_690", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "permute_302", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_300", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "alias_default_768", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_301", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "alias_default_769", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_302", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "alias_default_770", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_768", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_769", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_770", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.27.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_27", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.sdpa", + "name": "getitem_243", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.sdpa", + "name": "getitem_244", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_27", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.sdpa", + "name": "getitem_249", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_27", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.sdpa", + "name": "getitem_250", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_243", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.sdpa", + "name": "alias_default_771", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_771", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "permute_303", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_303", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_691", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_248", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wo", + "name": "dtype_cast_248", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_248", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention.wo", + "name": "permute_304", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_691", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "alias_default_772", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_304", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wo", + "name": "alias_default_773", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": 
"einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_772", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_773", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wo", + "name": "einsum_default_192", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_759", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_192", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27", + "name": "add_136", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_253", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "dtype_cast_249", + "op": 
"autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_136", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27", + "name": "alias_default_774", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_774", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "convert_element_type_662", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_662", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "alias_default_776", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_776", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "pow_56", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_56", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "mean_55", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "mean_55", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "add_137", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_137", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "rsqrt_55", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_55", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "alias_default_777", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", 
+ "name": "alias_default_776", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_777", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "mul_193", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_249", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "alias_default_775", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_193", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_775", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "mul_194", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_194", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "convert_element_type_663", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_249", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w1", + "name": "dtype_cast_250", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_250", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w1", + "name": "permute_305", + "op": "aten.permute.default", + "phase": "forward", + 
"placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_663", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "alias_default_778", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_305", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w1", + "name": "alias_default_779", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_778", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RS(1)", + "name": "alias_default_779", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w1", + "name": "einsum_default_193", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_193", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w1", + "name": "alias_default_780", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_780", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "convert_element_type_666", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_666", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "alias_default_781", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_781", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "neg_27", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_27", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "exp_27", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_27", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "add_138", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_781", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_138", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "div_27", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_27", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + 
"name": "convert_element_type_667", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_251", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w3", + "name": "dtype_cast_251", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_251", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w3", + "name": "permute_306", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": 
"permute_306", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w3", + "name": "alias_default_783", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_778", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_783", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w3", + "name": "einsum_default_194", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_667", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "alias_default_782", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_194", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w3", + "name": "alias_default_784", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_782", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_784", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "mul_195", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_250", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.27.feed_forward.w2", + "name": "dtype_cast_252", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_252", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w2", + "name": "permute_307", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_195", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "alias_default_785", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + 
"name": "permute_307", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w2", + "name": "alias_default_786", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_785", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_786", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w2", + "name": "einsum_default_195", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_774", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_195", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27", + "name": "add_139", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + 
"source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_261", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "dtype_cast_253", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_139", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27", + "name": "alias_default_787", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_787", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "convert_element_type_672", + "op": 
"prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_672", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "alias_default_789", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_789", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "pow_57", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_57", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.28.attention_norm", + "name": "mean_56", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_56", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "add_140", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_140", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "rsqrt_56", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_56", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.28.attention_norm", + "name": "alias_default_790", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_789", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_790", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "mul_196", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_253", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "alias_default_788", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 
52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_196", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_788", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "mul_197", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_197", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "convert_element_type_673", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_254", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wq", + "name": "dtype_cast_254", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_254", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention.wq", + "name": "permute_308", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_673", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "alias_default_791", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_308", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wq", + "name": "alias_default_792", + "op": "aten.alias.default", + "phase": 
"forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_791", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_792", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wq", + "name": "einsum_default_196", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_255", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wk", + "name": "dtype_cast_255", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_255", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention.wk", + "name": "permute_309", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_309", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wk", + "name": "alias_default_793", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_791", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_793", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention.wk", + "name": "einsum_default_197", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_256", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wv", + "name": "dtype_cast_256", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_256", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention.wv", + "name": "permute_310", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_310", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wv", + "name": "alias_default_794", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], 
+ "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_791", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_794", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wv", + "name": "einsum_default_198", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_196", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_706", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_197", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_707", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_198", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_708", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_706", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "convert_element_type_680", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + 
"dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_680", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_709", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_709", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_as_complex_56", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_707", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "convert_element_type_681", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_681", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_710", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_710", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_as_complex_57", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_711", + "op": "aten.view.default", + "phase": 
"forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_711", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "alias_default_795", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_795", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "mul_198", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + 
"name": "mul_198", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_as_real_56", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_712", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_795", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "mul_199", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_199", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_as_real_57", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_713", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_712", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "convert_element_type_682", + "op": "prims.convert_element_type.default", + 
"phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_713", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "convert_element_type_683", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_683", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "unsqueeze_56", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_56", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "expand_56", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "clone_56", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_714", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "view_708", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "unsqueeze_57", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "expand_57", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "clone_57", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_715", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_682", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "permute_311", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_714", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "permute_312", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 
+ }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_715", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "permute_313", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_311", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "alias_default_796", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_312", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "alias_default_797", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, 
seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_313", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "alias_default_798", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_796", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_797", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_798", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_28", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", 
+ "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_28", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.sdpa", + "name": "getitem_252", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_28", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.sdpa", + "name": "getitem_253", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_28", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.sdpa", + "name": "getitem_258", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + 
"line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_28", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.sdpa", + "name": "getitem_259", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_252", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.sdpa", + "name": "alias_default_799", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_799", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "permute_314", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_314", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_716", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_257", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wo", + "name": "dtype_cast_257", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_257", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention.wo", + "name": "permute_315", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_716", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "alias_default_800", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_315", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wo", + "name": "alias_default_801", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_800", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_801", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wo", + "name": "einsum_default_199", + "op": 
"aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_787", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_199", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28", + "name": "add_141", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_262", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "dtype_cast_258", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "add_141", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28", + "name": "alias_default_802", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_802", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "convert_element_type_686", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_686", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "alias_default_804", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + 
"cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_804", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "pow_58", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_58", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "mean_57", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_57", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "add_142", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, 
+ "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_142", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "rsqrt_57", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_57", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "alias_default_805", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_804", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_805", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "mul_200", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
+ "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_258", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "alias_default_803", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_200", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_803", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "mul_201", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_201", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": 
"convert_element_type_687", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_258", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w1", + "name": "dtype_cast_259", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_259", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w1", + "name": "permute_316", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": 
"convert_element_type_687", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "alias_default_806", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_316", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w1", + "name": "alias_default_807", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_806", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_807", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w1", + "name": "einsum_default_200", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_200", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w1", + "name": "alias_default_808", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_808", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "convert_element_type_690", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_690", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "alias_default_809", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_809", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "neg_28", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "exp_28", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "add_143", + "op": "aten.add.Tensor", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_809", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_143", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "div_28", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "convert_element_type_691", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + 
{ + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_260", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w3", + "name": "dtype_cast_260", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_260", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w3", + "name": "permute_317", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_317", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w3", + "name": "alias_default_811", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + 
"cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_806", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_811", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w3", + "name": "einsum_default_201", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_691", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "alias_default_810", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_201", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w3", + "name": "alias_default_812", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 
8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_810", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_812", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "mul_202", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_259", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w2", + "name": "dtype_cast_261", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + 
"name": "dtype_cast_261", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w2", + "name": "permute_318", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_202", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "alias_default_813", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_318", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w2", + "name": "alias_default_814", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 
694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_813", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_814", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w2", + "name": "einsum_default_202", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_802", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_202", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28", + "name": "add_144", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_270", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "dtype_cast_262", + "op": 
"autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_144", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28", + "name": "alias_default_815", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_815", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "convert_element_type_696", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_696", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "alias_default_817", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_817", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "pow_59", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_59", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "mean_58", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"mean_58", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "add_145", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_145", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "rsqrt_58", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_58", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "alias_default_818", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "alias_default_817", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_818", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "mul_203", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_262", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "alias_default_816", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_203", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_816", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "mul_204", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_204", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "convert_element_type_697", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_263", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wq", + "name": "dtype_cast_263", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_263", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention.wq", + "name": "permute_319", + "op": 
"aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_697", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "alias_default_819", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_319", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wq", + "name": "alias_default_820", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_819", + "src_placement": "S(0)R", + "transition_cost": 0 
+ }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_820", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wq", + "name": "einsum_default_203", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_264", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wk", + "name": "dtype_cast_264", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_264", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention.wk", + "name": "permute_320", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 
19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_320", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wk", + "name": "alias_default_821", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_819", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_821", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention.wk", + "name": "einsum_default_204", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_265", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wv", + "name": "dtype_cast_265", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": 
"return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_265", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention.wv", + "name": "permute_321", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_321", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wv", + "name": "alias_default_822", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_819", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_822", + "src_placement": "RS(1)", + "transition_cost": 
0 + } + ], + "module_path": "L['self'].layers.29.attention.wv", + "name": "einsum_default_205", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_203", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_731", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_204", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_732", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"einsum_default_205", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_733", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_731", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "convert_element_type_704", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_704", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_734", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 
0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_734", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_as_complex_58", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_732", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "convert_element_type_705", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_705", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_735", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + 
"code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_735", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_as_complex_59", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_736", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_736", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "alias_default_823", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_823", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "mul_205", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_205", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_as_real_58", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", 
+ "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_737", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_59", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_823", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "mul_206", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_206", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_as_real_59", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * 
freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_59", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_738", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_737", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "convert_element_type_706", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_738", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": 
"convert_element_type_707", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_707", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "unsqueeze_58", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "expand_58", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_58", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "clone_58", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_739", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_733", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "unsqueeze_59", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "unsqueeze_59", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "expand_59", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_59", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "clone_59", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_59", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_740", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_706", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "permute_322", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_739", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "permute_323", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_740", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "permute_324", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_322", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "alias_default_824", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_323", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "alias_default_825", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_324", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "alias_default_826", + "op": "aten.alias.default", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_824", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_825", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_826", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_29", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_29", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.sdpa", + "name": "getitem_261", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, 
is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_29", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.sdpa", + "name": "getitem_262", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_29", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.sdpa", + "name": "getitem_267", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_29", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.sdpa", + "name": "getitem_268", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_261", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.sdpa", + "name": "alias_default_827", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_827", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "permute_325", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_325", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_741", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + 
], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_266", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wo", + "name": "dtype_cast_266", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_266", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention.wo", + "name": "permute_326", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_741", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "alias_default_828", + "op": "aten.alias.default", + "phase": "forward", + 
"placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_326", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wo", + "name": "alias_default_829", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_828", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_829", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wo", + "name": "einsum_default_206", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_815", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_206", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29", + "name": "add_146", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_271", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "dtype_cast_267", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_146", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29", + "name": "alias_default_830", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_830", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "convert_element_type_710", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_710", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "alias_default_832", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_832", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "pow_60", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_60", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "mean_59", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_59", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "add_147", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_147", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "rsqrt_59", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_59", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "alias_default_833", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_832", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_833", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "mul_207", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_267", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "alias_default_831", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_207", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_831", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "mul_208", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_208", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "convert_element_type_711", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_267", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w1", + "name": "dtype_cast_268", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_268", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w1", + "name": "permute_327", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_711", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "alias_default_834", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 
85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_327", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w1", + "name": "alias_default_835", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_834", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_835", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w1", + "name": "einsum_default_207", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_207", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w1", + "name": "alias_default_836", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 
+ ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_836", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "convert_element_type_714", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_714", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "alias_default_837", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_837", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.29.feed_forward", + "name": "neg_29", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "exp_29", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "add_148", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_837", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_148", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "div_29", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "convert_element_type_715", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_269", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w3", + "name": "dtype_cast_269", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + 
"func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_269", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w3", + "name": "permute_328", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_328", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w3", + "name": "alias_default_839", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_834", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_839", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w3", + "name": "einsum_default_208", + "op": "aten.einsum.default", + "phase": 
"forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_715", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "alias_default_838", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_208", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w3", + "name": "alias_default_840", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_838", + "src_placement": "S(0)S(2)", + "transition_cost": 
0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_840", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "mul_209", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_268", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w2", + "name": "dtype_cast_270", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_270", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w2", + "name": "permute_329", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + 
"cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_209", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "alias_default_841", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_329", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w2", + "name": "alias_default_842", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_841", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_842", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w2", + "name": "einsum_default_209", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 
8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_830", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_209", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29", + "name": "add_149", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_279", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "dtype_cast_271", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_149", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29", + "name": "alias_default_843", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_843", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "convert_element_type_720", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_720", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "alias_default_845", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 
52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_845", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "pow_61", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_61", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "mean_60", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_60", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "add_150", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_150", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "rsqrt_60", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_60", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "alias_default_846", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_845", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_846", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "mul_210", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_271", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "alias_default_844", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_210", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_844", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "mul_211", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_211", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": 
"convert_element_type_721", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_272", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wq", + "name": "dtype_cast_272", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_272", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention.wq", + "name": "permute_330", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": 
"convert_element_type_721", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "alias_default_847", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_330", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wq", + "name": "alias_default_848", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_847", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_848", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wq", + "name": "einsum_default_210", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_273", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wk", + "name": "dtype_cast_273", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_273", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention.wk", + "name": "permute_331", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_331", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wk", + "name": "alias_default_849", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_847", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_849", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention.wk", + "name": "einsum_default_211", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_274", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wv", + "name": "dtype_cast_274", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_274", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.30.attention.wv", + "name": "permute_332", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_332", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wv", + "name": "alias_default_850", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_847", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_850", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wv", + "name": "einsum_default_212", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_210", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_756", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_211", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_757", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_212", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_758", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { 
+ "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_756", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "convert_element_type_728", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_728", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_759", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_759", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_as_complex_60", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": 
"xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_757", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "convert_element_type_729", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_729", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_760", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_760", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } 
+ ], + "module_path": "L['self'].layers.30.attention", + "name": "view_as_complex_61", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_761", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_761", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "alias_default_851", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", 
+ "name": "view_as_complex_60", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_851", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "mul_212", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_212", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_as_real_60", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_60", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_762", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_61", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_851", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "mul_213", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_213", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_as_real_61", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_61", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.30.attention", + "name": "view_763", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_762", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "convert_element_type_730", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_763", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "convert_element_type_731", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_731", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "unsqueeze_60", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_60", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "expand_60", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_60", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "clone_60", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 
0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_60", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_764", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_758", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "unsqueeze_61", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_61", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "expand_61", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_61", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "clone_61", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_61", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_765", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_730", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "permute_333", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, 
head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_764", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "permute_334", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_765", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "permute_335", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_333", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "alias_default_852", + "op": "aten.alias.default", + "phase": "forward", + "placement": 
"S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_334", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "alias_default_853", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_335", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "alias_default_854", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_852", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_853", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_854", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_30", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_30", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.sdpa", + "name": "getitem_270", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_30", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.sdpa", + "name": "getitem_271", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + 
"shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_30", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.sdpa", + "name": "getitem_276", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_30", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.sdpa", + "name": "getitem_277", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_270", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.sdpa", + "name": "alias_default_855", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + 
"source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_855", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "permute_336", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_336", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_766", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_275", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wo", + "name": "dtype_cast_275", + "op": "autoparallel.dtype_cast.default", + "phase": 
"forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_275", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention.wo", + "name": "permute_337", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_766", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "alias_default_856", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_337", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wo", + "name": "alias_default_857", + "op": 
"aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_856", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_857", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wo", + "name": "einsum_default_213", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_843", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_213", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30", + "name": "add_151", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, 
+ "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_280", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "dtype_cast_276", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_151", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30", + "name": "alias_default_858", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_858", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "convert_element_type_734", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_734", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "alias_default_860", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_860", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "pow_62", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_62", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "mean_61", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_61", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "add_152", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_152", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "rsqrt_61", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_61", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "alias_default_861", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_860", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_861", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "mul_214", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_276", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "alias_default_859", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_214", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "alias_default_859", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "mul_215", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_215", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "convert_element_type_735", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_276", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w1", + "name": "dtype_cast_277", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + 
"cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_277", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w1", + "name": "permute_338", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_735", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "alias_default_862", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_338", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w1", + "name": "alias_default_863", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_862", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_863", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w1", + "name": "einsum_default_214", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_214", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w1", + "name": "alias_default_864", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_864", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": 
"convert_element_type_738", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_738", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "alias_default_865", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_865", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "neg_30", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_30", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "exp_30", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_30", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "add_153", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_865", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_153", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "div_30", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { 
+ "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_30", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "convert_element_type_739", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_278", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w3", + "name": "dtype_cast_278", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_278", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w3", + "name": "permute_339", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_339", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w3", + "name": "alias_default_867", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_862", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_867", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w3", + "name": "einsum_default_215", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_739", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.30.feed_forward", + "name": "alias_default_866", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_215", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w3", + "name": "alias_default_868", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_866", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_868", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "mul_216", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + 
"cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_277", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": "dtype_cast_279", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_279", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": "permute_340", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_216", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "alias_default_869", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_340", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": "alias_default_870", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_869", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_870", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": "einsum_default_216", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_858", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 
430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_216", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30", + "name": "add_154", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 0, + "cluster_root": "dtype_cast_1", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_288", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "dtype_cast_280", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_154", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30", + "name": "alias_default_871", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 1, + "cluster_root": "convert_element_type", + 
"compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_871", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "convert_element_type_744", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 2, + "cluster_root": "alias_default_5", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_744", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "alias_default_873", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 3, + "cluster_root": "pow_1", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_873", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "pow_63", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": 
"rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 4, + "cluster_root": "mean", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_63", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "mean_62", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 5, + "cluster_root": "add", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_62", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "add_155", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 6, + "cluster_root": "rsqrt", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_155", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "rsqrt_62", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + 
"line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 7, + "cluster_root": "alias_default_6", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_62", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "alias_default_874", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 8, + "cluster_root": "mul", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_873", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_874", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "mul_217", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 9, + "cluster_root": "alias_default_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_280", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "alias_default_872", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + 
"shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 10, + "cluster_root": "mul_1", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_217", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_872", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "mul_218", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 11, + "cluster_root": "convert_element_type_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_218", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "convert_element_type_745", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 12, + "cluster_root": "dtype_cast_2", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_281", + 
"src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wq", + "name": "dtype_cast_281", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 13, + "cluster_root": "permute", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 93.01059422750424, + "dst_placement": "RS(0)", + "name": "dtype_cast_281", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention.wq", + "name": "permute_341", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 14, + "cluster_root": "alias_default_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_745", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "alias_default_875", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 15, + "cluster_root": "alias_default_8", + "compute_cost": 0.0, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_341", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wq", + "name": "alias_default_876", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 16, + "cluster_root": "einsum_default", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_875", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_876", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wq", + "name": "einsum_default_217", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 17, + "cluster_root": "dtype_cast_3", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_282", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wk", + "name": "dtype_cast_282", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, 
_dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 18, + "cluster_root": "permute_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 75.93123841862722, + "dst_placement": "RR", + "name": "dtype_cast_282", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention.wk", + "name": "permute_342", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 19, + "cluster_root": "alias_default_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_342", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wk", + "name": "alias_default_877", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 20, + "cluster_root": "einsum_default_1", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_875", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_877", + "src_placement": "RR", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention.wk", + 
"name": "einsum_default_218", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 21, + "cluster_root": "dtype_cast_4", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_283", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wv", + "name": "dtype_cast_283", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 22, + "cluster_root": "permute_2", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 39.60264855687606, + "dst_placement": "RS(0)", + "name": "dtype_cast_283", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention.wv", + "name": "permute_343", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 23, + "cluster_root": "alias_default_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_343", + "src_placement": 
"RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wv", + "name": "alias_default_878", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 24, + "cluster_root": "einsum_default_2", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_875", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_878", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wv", + "name": "einsum_default_219", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 25, + "cluster_root": "view_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_217", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_781", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 
+ }, + { + "cluster_id": 26, + "cluster_root": "view_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_218", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_782", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 27, + "cluster_root": "view_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_219", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_783", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 28, + "cluster_root": "convert_element_type_8", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_781", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "convert_element_type_752", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 29, + "cluster_root": "view_9", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_752", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_784", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 30, + "cluster_root": "view_as_complex", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_784", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_as_complex_62", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 31, + "cluster_root": "convert_element_type_9", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_782", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "convert_element_type_753", + 
"op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 32, + "cluster_root": "view_10", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_753", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_785", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 33, + "cluster_root": "view_as_complex_1", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_785", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_as_complex_63", + "op": "aten.view_as_complex.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 34, + "cluster_root": "view_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_786", + "op": "aten.view.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 35, + "cluster_root": "alias_default_11", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "view_786", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "alias_default_879", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "return freqs_cis.view(*shape)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "reshape_for_broadcast", + "line": 183 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 36, + "cluster_root": "mul_2", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_62", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_879", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "mul_219", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 37, + "cluster_root": "view_as_real", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_219", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_as_real_62", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 38, + "cluster_root": "view_12", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_62", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_787", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 39, + "cluster_root": "mul_3", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_63", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_879", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "mul_220", + "op": "aten.mul.Tensor", + 
"phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 40, + "cluster_root": "view_as_real_1", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_220", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_as_real_63", + "op": "aten.view_as_real.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 41, + "cluster_root": "view_13", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_63", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_788", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 42, + "cluster_root": "convert_element_type_10", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_787", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "convert_element_type_754", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 43, + "cluster_root": "convert_element_type_11", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_788", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "convert_element_type_755", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 44, + "cluster_root": "unsqueeze", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_755", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "unsqueeze_62", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 45, + "cluster_root": "expand", 
+ "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_62", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "expand_62", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 46, + "cluster_root": "clone", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_62", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "clone_62", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 47, + "cluster_root": "view_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_62", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_789", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 48, + "cluster_root": "unsqueeze_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_783", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "unsqueeze_63", + "op": "aten.unsqueeze.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 49, + "cluster_root": "expand_1", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "unsqueeze_63", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "expand_63", + "op": "aten.expand.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 50, + "cluster_root": "clone_1", + "compute_cost": 26.027785181236673, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "expand_63", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "clone_63", + "op": "aten.clone.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 51, + "cluster_root": "view_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "clone_63", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_790", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 52, + "cluster_root": "permute_3", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_754", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "permute_344", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 53, + "cluster_root": "permute_4", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_789", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "permute_345", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 
+ ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 54, + "cluster_root": "permute_5", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_790", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "permute_346", + "op": "aten.permute.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 55, + "cluster_root": "alias_default_12", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_344", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "alias_default_880", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 56, + "cluster_root": "alias_default_13", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_345", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + 
"name": "alias_default_881", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 57, + "cluster_root": "alias_default_14", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_346", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "alias_default_882", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 58, + "cluster_root": "_scaled_dot_product_flash_attention", + "compute_cost": 794.1005545110502, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_880", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_881", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_882", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_31", + "op": "aten._scaled_dot_product_flash_attention.default", + "phase": "forward", + "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + "shape": [ + 8, 
+ 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 59, + "cluster_root": "getitem", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.sdpa", + "name": "getitem_279", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.sdpa", + "name": "getitem_280", + "op": "", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_31", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.sdpa", + "name": "getitem_285", + "op": "", + "phase": "forward", + "placement": "RR", + 
"shape": [ + 2 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "uint64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_scaled_dot_product_flash_attention_31", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.sdpa", + "name": "getitem_286", + "op": "", + "phase": "forward", + "placement": "RR", + "shape": [], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 60, + "cluster_root": "alias_default_15", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_279", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.sdpa", + "name": "alias_default_883", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 61, + "cluster_root": "permute_6", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_883", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "permute_347", + "op": "aten.permute.default", + "phase": "forward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 62, + "cluster_root": "view_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "permute_347", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_791", + "op": "aten.view.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 63, + "cluster_root": "dtype_cast_5", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_284", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wo", + "name": "dtype_cast_284", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 64, + "cluster_root": "permute_7", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 187.32495367450883, + "dst_placement": "RR", + "name": "dtype_cast_284", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention.wo", + "name": 
"permute_348", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 65, + "cluster_root": "alias_default_16", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "view_791", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "alias_default_884", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 66, + "cluster_root": "alias_default_17", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_348", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wo", + "name": "alias_default_885", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 67, + "cluster_root": "einsum_default_3", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_884", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_885", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wo", + "name": "einsum_default_220", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 68, + "cluster_root": "add_1", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_871", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_220", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31", + "name": "add_156", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 69, + "cluster_root": "dtype_cast_6", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_289", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "dtype_cast_285", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, 
+ "transition_cost": 0.0 + }, + { + "cluster_id": 70, + "cluster_root": "alias_default_18", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_156", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31", + "name": "alias_default_886", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 419 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 71, + "cluster_root": "convert_element_type_14", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_886", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "convert_element_type_758", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 72, + "cluster_root": "alias_default_20", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_758", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "alias_default_888", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 73, + "cluster_root": "pow_2", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_888", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "pow_64", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 74, + "cluster_root": "mean_1", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_64", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "mean_63", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 75, + "cluster_root": "add_2", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_63", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "add_157", + "op": "aten.add.Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 76, + "cluster_root": "rsqrt_1", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_157", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "rsqrt_63", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 77, + "cluster_root": "alias_default_21", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_63", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "alias_default_889", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 78, + "cluster_root": "mul_4", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_888", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_889", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + 
"name": "mul_221", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 79, + "cluster_root": "alias_default_19", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_285", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "alias_default_887", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 80, + "cluster_root": "mul_5", + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_221", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_887", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "mul_222", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 81, + "cluster_root": "convert_element_type_15", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_222", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "convert_element_type_759", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 82, + "cluster_root": "dtype_cast_7", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_285", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w1", + "name": "dtype_cast_286", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 83, + "cluster_root": "permute_8", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_286", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w1", + "name": "permute_349", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + 
"cluster_id": 84, + "cluster_root": "alias_default_22", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_759", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "alias_default_890", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 85, + "cluster_root": "alias_default_23", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_349", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w1", + "name": "alias_default_891", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 86, + "cluster_root": "einsum_default_4", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_890", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_891", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w1", + "name": "einsum_default_221", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, 
+ 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 87, + "cluster_root": "alias_default_24", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_221", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w1", + "name": "alias_default_892", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 88, + "cluster_root": "convert_element_type_18", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_892", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "convert_element_type_762", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 89, + "cluster_root": "alias_default_25", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_762", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.31.feed_forward", + "name": "alias_default_893", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 90, + "cluster_root": "neg", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_893", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "neg_31", + "op": "aten.neg.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 91, + "cluster_root": "exp", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "exp_31", + "op": "aten.exp.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 92, + "cluster_root": "add_3", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_31", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "add_158", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 93, + "cluster_root": "div", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_893", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_158", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "div_31", + "op": "aten.div.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 94, + "cluster_root": "convert_element_type_19", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "div_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "convert_element_type_763", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 95, + "cluster_root": "dtype_cast_8", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_287", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w3", + "name": "dtype_cast_287", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 96, + "cluster_root": "permute_9", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(0)", + "name": "dtype_cast_287", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w3", + "name": "permute_350", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 97, + "cluster_root": "alias_default_27", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_350", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w3", + "name": "alias_default_895", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * 
self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 98, + "cluster_root": "einsum_default_5", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_890", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_895", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w3", + "name": "einsum_default_222", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 99, + "cluster_root": "alias_default_26", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_763", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "alias_default_894", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 100, + "cluster_root": "alias_default_28", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_222", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w3", + "name": "alias_default_896", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 101, + "cluster_root": "mul_6", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_894", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_896", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "mul_223", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 102, + "cluster_root": "dtype_cast_9", + "compute_cost": 8.540367012593283, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "primals_286", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w2", + "name": "dtype_cast_288", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 
+ }, + "transition_cost": 0.0 + }, + { + "cluster_id": 103, + "cluster_root": "permute_10", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 258.576, + "dst_placement": "RS(1)", + "name": "dtype_cast_288", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w2", + "name": "permute_351", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 104, + "cluster_root": "alias_default_29", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_223", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "alias_default_897", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 105, + "cluster_root": "alias_default_30", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_351", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w2", + "name": "alias_default_898", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 106, + "cluster_root": "einsum_default_6", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_897", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_898", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w2", + "name": "einsum_default_223", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 107, + "cluster_root": "add_4", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_886", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_223", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31", + "name": "add_159", + "op": "aten.add.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": 
"primals_290", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "dtype_cast_289", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 108, + "cluster_root": "alias_default_31", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_159", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31", + "name": "alias_default_899", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "out = h + self.feed_forward(self.ffn_norm(h))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 420 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_899", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "convert_element_type_768", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_768", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "alias_default_901", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_901", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "pow_65", + "op": "aten.pow.Tensor_Scalar", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "pow_65", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "mean_64", + "op": "aten.mean.dim", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mean_64", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "add_160", + "op": "aten.add.Scalar", + "phase": "forward", 
+ "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_160", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "rsqrt_64", + "op": "aten.rsqrt.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "rsqrt_64", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "alias_default_902", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_901", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_902", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "mul_224", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ 
+ 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 28.358260191421483, + "dst_placement": "RR", + "name": "dtype_cast_289", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].norm", + "name": "alias_default_900", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 52.058747582344104, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_224", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_900", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "mul_225", + "op": "aten.mul.Tensor", + "phase": "forward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_225", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "convert_element_type_769", + "op": "prims.convert_element_type.default", + "phase": "forward", + "placement": "S(0)S(1)", + 
"shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 76.40578345195063, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(0)", + "name": "primals_291", + "src_placement": "S(0)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].output", + "name": "dtype_cast_290", + "op": "autoparallel.dtype_cast.default", + "phase": "forward", + "placement": "S(0)S(0)", + "shape": [ + 128256, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 2081.296, + "dst_placement": "RS(0)", + "name": "dtype_cast_290", + "src_placement": "S(0)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].output", + "name": "permute_352", + "op": "aten.permute.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 128256 + ], + "source": { + "code": "output = self.output(h) if self.output else h", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 545 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_769", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].norm", + "name": "alias_default_903", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_352", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].output", + "name": "alias_default_904", + "op": "aten.alias.default", + "phase": "forward", + "placement": "RS(1)", + "shape": [ + 4096, + 128256 + ], + "source": { + "code": "output = self.output(h) if self.output else h", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 545 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 6216.318403281814, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_903", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_904", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].output", + "name": "einsum_default_224", + "op": "aten.einsum.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 128256 + ], + "source": { + "code": "output = self.output(h) if self.output else h", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 545 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_224", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].output", + "name": "alias_default_1420", + "op": "aten.alias.default", + "phase": "forward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 128256 + ], + "source": { + "code": "output = 
self.output(h) if self.output else h", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 545 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "tangents_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].output", + "name": "alias_default_2", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 128256 + ], + "transition_cost": 0.0 + }, + { + "compute_cost": 6216.318403281814, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_903", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].output", + "name": "einsum_default_225", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 128256 + ], + "source": { + "code": "output = self.output(h) if self.output else h", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 545 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_904", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].output", + "name": "permute_355", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 128256, + 4096 + ], + "source": { + "code": "output = self.output(h) if self.output else h", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 545 + }, + 
"transition_cost": 0.0 + }, + { + "compute_cost": 6216.318403281814, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_355", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].output", + "name": "einsum_default_226", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "output = self.output(h) if self.output else h", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 545 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_225", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].output", + "name": "permute_356", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 128256, + 4096 + ], + "source": { + "code": "output = self.output(h) if self.output else h", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 545 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 305.6231338078025, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_356", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].output", + "name": "dtype_cast_291", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 128256, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 4133.392, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_291", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].output", + "name": "alias_default_1711", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 128256, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_226", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].norm", + "name": "convert_element_type_776", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_899", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "convert_element_type_777", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_900", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "convert_element_type_778", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_776", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "alias_default_905", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_905", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_778", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "mul_226", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_777", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_902", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "mul_227", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_226", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "alias_default_906", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_227", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "alias_default_907", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_907", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_906", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "mul_228", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_228", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "sum_1", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_907", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "div_32", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_32", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "mul_229", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_906", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_229", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "sub", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_902", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].norm", + "name": "mul_230", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_905", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_907", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "mul_231", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_231", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "sum_2", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_230", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + 
"name": "convert_element_type_779", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_2", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "convert_element_type_780", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_780", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].norm", + "name": "dtype_cast_292", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_292", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].norm", + "name": "alias_default_1710", + "op": "aten.alias.default", + 
"phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "convert_element_type_779", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].norm", + "name": "alias_default_908", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_908", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_897", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w2", + "name": "einsum_default_227", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": 
"alias_default_898", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w2", + "name": "permute_359", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_908", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_359", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w2", + "name": "einsum_default_228", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_227", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w2", + "name": "permute_360", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", 
+ "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + "cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_360", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w2", + "name": "dtype_cast_293", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_293", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w2", + "name": "alias_default_1706", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_228", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w2", + "name": "alias_default_909", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_909", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_894", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "mul_232", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_909", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_896", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "mul_233", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_232", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "alias_default_910", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_910", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_890", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w3", + "name": "einsum_default_229", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_895", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w3", + "name": "permute_363", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_910", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_363", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w3", + "name": "einsum_default_230", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_229", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w3", + "name": "permute_364", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_364", + 
"src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w3", + "name": "dtype_cast_294", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_294", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w3", + "name": "alias_default_1707", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_233", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "convert_element_type_789", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, 
+ "cluster_root": "convert_element_type_2140", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_892", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "convert_element_type_790", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_790", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "alias_default_911", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_911", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "neg_32", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_32", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "exp_32", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_32", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "add_161", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_161", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "reciprocal", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "mul_234", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_234", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "alias_default_912", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_789", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_912", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "mul_235", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_912", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "sub_1", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_911", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "mul_236", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_236", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "add_162", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_235", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_162", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "mul_237", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_237", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "convert_element_type_791", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_791", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward", + "name": "alias_default_913", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_913", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_890", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w1", + "name": "einsum_default_231", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + 
{ + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_891", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w1", + "name": "permute_367", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_913", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_367", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w1", + "name": "einsum_default_232", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_230", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_232", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31", + "name": "add_163", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 
8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_231", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w1", + "name": "permute_368", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_368", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.feed_forward.w1", + "name": "dtype_cast_295", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_295", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.31.feed_forward.w1", + "name": "alias_default_1705", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_163", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "convert_element_type_796", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_886", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "convert_element_type_797", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + "compute_cost": 7.0, 
+ "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_887", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "convert_element_type_798", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_796", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "alias_default_914", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_914", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_798", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "mul_238", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_797", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_889", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "mul_239", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_238", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "alias_default_915", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_239", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "alias_default_916", + "op": 
"aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_916", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_915", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "mul_240", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_240", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "sum_3", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "alias_default_916", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "div_33", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_33", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "mul_241", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_915", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_241", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "sub_2", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_2", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_889", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "mul_242", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_914", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_916", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "mul_243", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_243", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "sum_4", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_242", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "convert_element_type_799", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_4", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "convert_element_type_800", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_908", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_799", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "name": "add_164", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_800", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "dtype_cast_296", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_296", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.ffn_norm", + "name": "alias_default_1709", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_164", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wo", + "name": "alias_default_917", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_917", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_884", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wo", + "name": "einsum_default_233", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_885", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wo", + "name": "permute_371", + "op": 
"aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_917", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_371", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wo", + "name": "einsum_default_234", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_233", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wo", + "name": "permute_372", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"P(sum)P(sum)", + "name": "permute_372", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wo", + "name": "dtype_cast_297", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_297", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention.wo", + "name": "alias_default_1704", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_234", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_812", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 
160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_812", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "permute_373", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_373", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_880", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_881", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_882", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_883", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_280", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_285", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_286", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.sdpa", + 
"name": "_scaled_dot_product_flash_attention_backward", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.sdpa", + "name": "getitem_288", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.sdpa", + "name": "getitem_289", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.sdpa", + "name": "getitem_290", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_290", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "permute_374", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_289", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "permute_375", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_288", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "permute_376", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_374", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_813", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_813", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "sum_5", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 
8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "squeeze", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_375", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_814", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_814", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "sum_6", + "op": "aten.sum.dim_IntList", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "squeeze_1", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "convert_element_type_805", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_376", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + 
} + ], + "module_path": "L['self'].layers.31.attention", + "name": "convert_element_type_806", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_805", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_815", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_815", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_as_complex_64", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, 
+ "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_879", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "_conj", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "clone_70", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_64", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_70", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "mul_244", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_806", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_816", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_816", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_as_complex_65", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_879", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "_conj_1", + "op": "aten._conj.default", + "phase": "backward", + "placement": 
"RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_1", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "clone_71", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_65", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_71", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "mul_245", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + 
"name": "mul_244", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_as_real_64", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_64", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_817", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_817", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "convert_element_type_807", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_245", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_as_real_65", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_65", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_818", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_818", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "convert_element_type_808", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_819", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_807", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "view_820", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_808", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + 
"name": "view_821", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_819", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "alias_default_918", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_918", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_875", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wv", + "name": "einsum_default_235", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_878", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wv", + "name": "permute_379", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_918", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_379", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention.wv", + "name": "einsum_default_236", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_235", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wv", + "name": "permute_380", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = 
self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_380", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wv", + "name": "dtype_cast_298", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_298", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention.wv", + "name": "alias_default_1703", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_820", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "alias_default_919", + "op": 
"aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_919", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_875", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wk", + "name": "einsum_default_237", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_877", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wk", + "name": "permute_383", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_919", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_383", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wk", + "name": "einsum_default_238", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_236", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_238", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention", + "name": "add_165", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_237", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wk", + "name": "permute_384", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_384", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wk", + "name": "dtype_cast_299", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_299", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention.wk", + "name": "alias_default_1702", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_821", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.31.attention", + "name": "alias_default_920", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_920", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_875", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wq", + "name": "einsum_default_239", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_876", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wq", + "name": "permute_387", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + 
"cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_920", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_387", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention.wq", + "name": "einsum_default_240", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_165", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_240", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31", + "name": "add_166", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": "permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_239", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wq", + "name": 
"permute_388", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_388", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention.wq", + "name": "dtype_cast_300", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_300", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention.wq", + "name": "alias_default_1701", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "add_166", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "convert_element_type_821", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_871", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "convert_element_type_822", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_872", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "convert_element_type_823", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_821", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "alias_default_921", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_921", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_823", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "mul_246", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_822", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_874", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.31.attention_norm", + "name": "mul_247", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_246", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "alias_default_922", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_247", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "alias_default_923", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_923", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_922", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "mul_248", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_248", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "sum_7", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_923", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "div_34", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_34", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_7", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "mul_249", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_922", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_249", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "sub_3", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + "cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_874", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "mul_250", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_921", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_923", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "mul_251", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_251", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "sum_8", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + 
"cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_250", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "convert_element_type_824", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_8", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "convert_element_type_825", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_917", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_824", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "add_167", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 
8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_825", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "dtype_cast_301", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_301", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.31.attention_norm", + "name": "alias_default_1708", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_167", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": 
"alias_default_924", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_924", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_869", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": "einsum_default_241", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_870", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": "permute_391", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 
694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_924", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_391", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": "einsum_default_242", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_241", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": "permute_392", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + "cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_392", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": "dtype_cast_302", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": 
"return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_302", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": "alias_default_1697", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_242", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w2", + "name": "alias_default_925", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_925", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"alias_default_866", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "mul_252", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_925", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_868", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "mul_253", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_252", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "alias_default_926", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_926", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_862", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w3", + "name": "einsum_default_243", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_867", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w3", + "name": "permute_395", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_926", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_395", + "src_placement": 
"RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w3", + "name": "einsum_default_244", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_243", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w3", + "name": "permute_396", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_396", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w3", + "name": "dtype_cast_303", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + 
"compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_303", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w3", + "name": "alias_default_1698", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_253", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "convert_element_type_834", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_864", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "convert_element_type_835", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_835", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "alias_default_927", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_927", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "neg_33", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_33", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "exp_33", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_33", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "add_168", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_168", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "reciprocal_1", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_1", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "mul_254", + 
"op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_254", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "alias_default_928", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_834", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_928", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "mul_255", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": 
"float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_928", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "sub_4", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_927", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_4", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "mul_256", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_256", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "add_169", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_255", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_169", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "mul_257", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_257", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward", + "name": "convert_element_type_836", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_836", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.30.feed_forward", + "name": "alias_default_929", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_929", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_862", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w1", + "name": "einsum_default_245", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_863", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w1", + "name": "permute_399", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_929", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_399", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w1", + "name": "einsum_default_246", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_244", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_246", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30", + "name": "add_170", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_245", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.30.feed_forward.w1", + "name": "permute_400", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_400", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w1", + "name": "dtype_cast_304", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_304", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.feed_forward.w1", + "name": "alias_default_1696", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + 
"inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_170", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "convert_element_type_841", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_858", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "convert_element_type_842", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_859", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "convert_element_type_843", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 
2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_841", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "alias_default_930", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_930", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_843", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "mul_258", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_842", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_861", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.30.ffn_norm", + "name": "mul_259", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_258", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "alias_default_931", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_259", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "alias_default_932", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_932", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_931", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "mul_260", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_260", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "sum_9", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_932", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "div_35", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + 
"cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_35", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "mul_261", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_931", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_261", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "sub_5", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_861", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + 
], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "mul_262", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_930", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_932", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "mul_263", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_263", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "sum_10", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 
39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_262", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "convert_element_type_844", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_10", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "convert_element_type_845", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_924", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_844", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "add_171", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_845", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "dtype_cast_305", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_305", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.ffn_norm", + "name": "alias_default_1700", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_171", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wo", + "name": "alias_default_933", + "op": "aten.alias.default", 
+ "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_933", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_856", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wo", + "name": "einsum_default_247", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_857", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wo", + "name": "permute_403", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "alias_default_933", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_403", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wo", + "name": "einsum_default_248", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_247", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wo", + "name": "permute_404", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_404", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wo", + "name": "dtype_cast_306", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + 
"func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_306", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention.wo", + "name": "alias_default_1695", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_248", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_836", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_836", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "permute_405", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", 
+ "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_405", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_852", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_853", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_854", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_855", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_271", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_276", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_277", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_1", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.sdpa", + "name": "getitem_291", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.sdpa", + "name": "getitem_292", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.sdpa", + "name": "getitem_293", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, 
v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_293", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "permute_406", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_292", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "permute_407", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_291", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "permute_408", + "op": "aten.permute.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_406", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_837", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_837", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "sum_11", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.30.attention", + "name": "squeeze_2", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_407", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_838", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_838", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "sum_12", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_12", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "squeeze_3", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "convert_element_type_850", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_408", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "convert_element_type_851", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + 
"cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_850", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_839", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_839", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_as_complex_66", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_851", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "_conj_2", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_2", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "clone_78", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_66", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_78", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "mul_264", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_851", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.30.attention", + "name": "view_840", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_840", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_as_complex_67", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_851", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "_conj_3", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "_conj_3", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "clone_79", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_67", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_79", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "mul_265", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_264", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_as_real_66", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_66", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_841", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_841", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "convert_element_type_852", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_265", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_as_real_67", + 
"op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_67", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_842", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_842", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "convert_element_type_853", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", 
+ "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_843", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_852", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_844", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_853", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "view_845", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": 
"alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_843", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "alias_default_934", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_934", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_847", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wv", + "name": "einsum_default_249", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_850", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wv", + "name": "permute_411", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv 
= self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_934", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_411", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention.wv", + "name": "einsum_default_250", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_249", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wv", + "name": "permute_412", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_412", + 
"src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wv", + "name": "dtype_cast_307", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_307", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention.wv", + "name": "alias_default_1694", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_844", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "alias_default_935", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + 
"compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_935", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_847", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wk", + "name": "einsum_default_251", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_849", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wk", + "name": "permute_415", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_935", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_415", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wk", + "name": "einsum_default_252", + "op": 
"aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_250", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_252", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "add_172", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_251", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wk", + "name": "permute_416", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_416", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wk", + "name": "dtype_cast_308", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_308", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention.wk", + "name": "alias_default_1693", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_845", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention", + "name": "alias_default_936", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_936", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_847", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wq", + "name": "einsum_default_253", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_848", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wq", + "name": "permute_419", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_936", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_419", + "src_placement": "RS(0)", + 
"transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention.wq", + "name": "einsum_default_254", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_172", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_254", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30", + "name": "add_173", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": "permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_253", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wq", + "name": "permute_420", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_420", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention.wq", + "name": "dtype_cast_309", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_309", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention.wq", + "name": "alias_default_1692", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_173", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "convert_element_type_866", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_843", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "convert_element_type_867", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_844", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "convert_element_type_868", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_866", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "alias_default_937", + "op": 
"aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_937", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_868", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "mul_266", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_867", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_846", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "mul_267", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + 
}, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_266", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "alias_default_938", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_267", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "alias_default_939", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_939", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_938", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "mul_268", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + 
"source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_268", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "sum_13", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_939", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "div_36", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_36", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_13", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.30.attention_norm", + "name": "mul_269", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_938", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_269", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "sub_6", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + "cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_846", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "mul_270", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": 
"rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_937", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_939", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "mul_271", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_271", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "sum_14", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_270", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "convert_element_type_869", + "op": "prims.convert_element_type.default", + 
"phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_14", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "convert_element_type_870", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_933", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_869", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "add_174", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_870", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "dtype_cast_310", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_310", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.30.attention_norm", + "name": "alias_default_1699", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_174", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w2", + "name": "alias_default_940", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + 
"transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_940", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_841", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w2", + "name": "einsum_default_255", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_842", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w2", + "name": "permute_423", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_940", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_423", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.29.feed_forward.w2", + "name": "einsum_default_256", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_255", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w2", + "name": "permute_424", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + "cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_424", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w2", + "name": "dtype_cast_311", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ 
+ { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_311", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w2", + "name": "alias_default_1688", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_256", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w2", + "name": "alias_default_941", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_941", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_838", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "mul_272", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_941", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_840", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "mul_273", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_272", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "alias_default_942", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_942", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_834", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w3", + "name": "einsum_default_257", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_839", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w3", + "name": "permute_427", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_942", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_427", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w3", + "name": "einsum_default_258", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_257", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w3", + "name": "permute_428", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_428", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w3", + "name": "dtype_cast_312", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_312", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w3", + "name": "alias_default_1689", + "op": "aten.alias.default", + "phase": 
"backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_273", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "convert_element_type_879", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_836", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "convert_element_type_880", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"convert_element_type_880", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "alias_default_943", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_943", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "neg_34", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "exp_34", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 
182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "add_175", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_175", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "reciprocal_2", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_2", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "mul_274", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_274", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "alias_default_944", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_879", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_944", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "mul_275", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_944", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "sub_7", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_943", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "mul_276", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_276", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "add_176", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"mul_275", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_176", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "mul_277", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_277", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "convert_element_type_881", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_881", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward", + "name": "alias_default_945", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_945", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_834", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w1", + "name": "einsum_default_259", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_835", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w1", + "name": "permute_431", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_945", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + 
"comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_431", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w1", + "name": "einsum_default_260", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_258", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_260", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29", + "name": "add_177", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_259", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w1", + "name": "permute_432", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_432", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w1", + "name": "dtype_cast_313", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_313", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.feed_forward.w1", + "name": "alias_default_1687", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_177", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "convert_element_type_886", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_830", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "convert_element_type_887", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_831", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "convert_element_type_888", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_886", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "alias_default_946", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_946", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_888", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "mul_278", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_887", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_833", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "mul_279", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_278", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "alias_default_947", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_279", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "alias_default_948", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_948", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_947", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "mul_280", 
+ "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_280", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "sum_15", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_948", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "div_37", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_37", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "sum_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "mul_281", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_947", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_281", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "sub_8", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_8", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_833", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "mul_282", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + 
"file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_946", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_948", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "mul_283", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_283", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "sum_16", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_282", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": 
"convert_element_type_889", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_16", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "convert_element_type_890", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_940", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_889", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "add_178", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_890", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "dtype_cast_314", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_314", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.ffn_norm", + "name": "alias_default_1691", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_178", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wo", + "name": "alias_default_949", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": 
"rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_949", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_828", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wo", + "name": "einsum_default_261", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_829", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wo", + "name": "permute_435", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_949", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_435", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.29.attention.wo", + "name": "einsum_default_262", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_261", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wo", + "name": "permute_436", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_436", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wo", + "name": "dtype_cast_315", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": 
"S(0)S(0)", + "name": "dtype_cast_315", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention.wo", + "name": "alias_default_1686", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_262", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_860", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_860", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "permute_437", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": 
"_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_437", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_824", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_825", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_826", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_827", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_262", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_267", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_268", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_2", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": 
"_scaled_dot_product_flash_attention_backward_2", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.sdpa", + "name": "getitem_294", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_2", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.sdpa", + "name": "getitem_295", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_2", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.sdpa", + "name": "getitem_296", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + 
}, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_296", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "permute_438", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_295", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "permute_439", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_294", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "permute_440", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_438", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_861", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_861", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "sum_17", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_17", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "squeeze_4", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + 
"source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_439", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_862", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_862", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "sum_18", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "squeeze_5", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "convert_element_type_895", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_440", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "convert_element_type_896", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_895", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_863", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_863", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_as_complex_68", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_823", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "_conj_4", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + 
"compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_4", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "clone_86", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_68", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_86", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "mul_284", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_896", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_864", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * 
freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_864", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_as_complex_69", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_823", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "_conj_5", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_5", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "clone_87", + "op": "aten.clone.default", + "phase": "backward", + "placement": 
"RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_69", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_87", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "mul_285", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_284", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_as_real_68", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_68", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_865", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_865", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "convert_element_type_897", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_285", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_as_real_69", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_69", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_866", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_866", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "convert_element_type_898", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_4", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_867", + "op": 
"aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_897", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_868", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_898", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "view_869", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_867", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.29.attention", + "name": "alias_default_950", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_950", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_819", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wv", + "name": "einsum_default_263", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_822", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wv", + "name": "permute_443", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": 
"einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_950", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_443", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention.wv", + "name": "einsum_default_264", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_263", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wv", + "name": "permute_444", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_444", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wv", + "name": "dtype_cast_316", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ 
+ 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_316", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention.wv", + "name": "alias_default_1685", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_868", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "alias_default_951", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_951", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)R", + "name": "alias_default_819", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wk", + "name": "einsum_default_265", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_821", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wk", + "name": "permute_447", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_951", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_447", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wk", + "name": "einsum_default_266", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_264", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_266", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "add_179", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_265", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wk", + "name": "permute_448", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_448", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.29.attention.wk", + "name": "dtype_cast_317", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_317", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention.wk", + "name": "alias_default_1684", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_869", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention", + "name": "alias_default_952", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_952", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_819", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wq", + "name": "einsum_default_267", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_820", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wq", + "name": "permute_451", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_952", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_451", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention.wq", + "name": "einsum_default_268", + "op": "aten.einsum.default", + "phase": "backward", + 
"placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_179", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_268", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29", + "name": "add_180", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": "permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_267", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wq", + "name": "permute_452", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"P(sum)S(0)", + "name": "permute_452", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention.wq", + "name": "dtype_cast_318", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_318", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention.wq", + "name": "alias_default_1683", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_180", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "convert_element_type_911", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + 
{ + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_815", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "convert_element_type_912", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_816", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "convert_element_type_913", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_911", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "alias_default_953", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_953", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_913", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "mul_286", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_912", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_818", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "mul_287", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", 
+ "name": "mul_286", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "alias_default_954", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_287", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "alias_default_955", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_955", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_954", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "mul_288", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + 
}, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_288", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "sum_19", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_955", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "div_38", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_38", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "mul_289", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + 
"source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_954", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_289", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "sub_9", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + "cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_818", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "mul_290", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_953", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_955", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "mul_291", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_291", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "sum_20", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_290", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "convert_element_type_914", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_20", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "convert_element_type_915", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_949", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_914", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "add_181", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_915", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.29.attention_norm", + "name": "dtype_cast_319", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_319", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.29.attention_norm", + "name": "alias_default_1690", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_181", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w2", + "name": "alias_default_956", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)R", + "name": "alias_default_956", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_813", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w2", + "name": "einsum_default_269", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_814", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w2", + "name": "permute_455", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_956", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_455", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w2", + "name": "einsum_default_270", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_269", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w2", + "name": "permute_456", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + "cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_456", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w2", + "name": "dtype_cast_320", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_320", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.28.feed_forward.w2", + "name": "alias_default_1679", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_270", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w2", + "name": "alias_default_957", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_957", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_810", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "mul_292", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_957", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_812", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "mul_293", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_292", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "alias_default_958", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_958", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_806", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.28.feed_forward.w3", + "name": "einsum_default_271", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_811", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w3", + "name": "permute_459", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_958", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_459", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w3", + "name": "einsum_default_272", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + 
"cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_271", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w3", + "name": "permute_460", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_460", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w3", + "name": "dtype_cast_321", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_321", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w3", + "name": "alias_default_1680", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_293", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "convert_element_type_924", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_808", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "convert_element_type_925", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_925", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": 
"alias_default_959", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_959", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "neg_35", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_35", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "exp_35", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_35", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "add_182", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_182", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "reciprocal_3", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_3", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "mul_294", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_294", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "alias_default_960", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_924", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_960", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "mul_295", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_960", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "sub_10", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_959", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "mul_296", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_296", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "add_183", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_295", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"add_183", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "mul_297", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_297", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "convert_element_type_926", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_926", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward", + "name": "alias_default_961", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + 
"cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_961", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_806", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w1", + "name": "einsum_default_273", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_807", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w1", + "name": "permute_463", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_961", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_463", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.28.feed_forward.w1", + "name": "einsum_default_274", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_272", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_274", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28", + "name": "add_184", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_273", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w1", + "name": "permute_464", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, 
+ "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_464", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w1", + "name": "dtype_cast_322", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_322", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.feed_forward.w1", + "name": "alias_default_1678", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_184", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "convert_element_type_931", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_802", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "convert_element_type_932", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_803", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "convert_element_type_933", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_931", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "alias_default_962", + "op": "aten.alias.default", + "phase": 
"backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_962", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_933", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "mul_298", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_932", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_805", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "mul_299", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": 
"alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_298", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "alias_default_963", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_299", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "alias_default_964", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_964", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_963", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "mul_300", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_300", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "sum_21", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_964", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "div_39", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_39", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "mul_301", + 
"op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_963", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_301", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "sub_11", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_11", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_805", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "mul_302", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + 
"cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_962", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_964", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "mul_303", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_303", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "sum_22", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_302", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "convert_element_type_934", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + 
"code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_22", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "convert_element_type_935", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_956", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_934", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "add_185", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_935", + "src_placement": 
"P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "dtype_cast_323", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_323", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.ffn_norm", + "name": "alias_default_1682", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_185", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wo", + "name": "alias_default_965", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_965", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_800", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wo", + "name": "einsum_default_275", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_801", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wo", + "name": "permute_467", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_965", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_467", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wo", + "name": "einsum_default_276", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + 
], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_275", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wo", + "name": "permute_468", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_468", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wo", + "name": "dtype_cast_324", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_324", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention.wo", + "name": 
"alias_default_1677", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_276", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_884", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_884", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "permute_469", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_469", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_796", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_797", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_798", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_799", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_253", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_258", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_259", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_3", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.sdpa", + "name": "getitem_297", + "op": "", + "phase": "backward", + 
"placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.sdpa", + "name": "getitem_298", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.sdpa", + "name": "getitem_299", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_299", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "permute_470", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_298", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "permute_471", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_297", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "permute_472", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, 
+ "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_470", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_885", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_885", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "sum_23", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "squeeze_6", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 
171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_471", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_886", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_886", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "sum_24", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "squeeze_7", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 
+ }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "convert_element_type_940", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_472", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "convert_element_type_941", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_940", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_887", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 
64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_887", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_as_complex_70", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_795", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "_conj_6", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_6", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "clone_94", + 
"op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_70", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_94", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "mul_304", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_941", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_888", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + 
"compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_888", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_as_complex_71", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_795", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "_conj_7", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_7", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "clone_95", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + 
}, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_71", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_95", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "mul_305", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_304", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_as_real_70", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_70", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_889", + "op": "aten.view.default", + "phase": "backward", 
+ "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_889", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "convert_element_type_942", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_305", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_as_real_71", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "view_as_real_71", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_890", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_890", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "convert_element_type_943", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_891", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 
+ }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_942", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_892", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_943", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "view_893", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_891", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "alias_default_966", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_966", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_791", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wv", + "name": "einsum_default_277", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_794", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wv", + "name": "permute_475", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_966", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + 
"comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_475", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention.wv", + "name": "einsum_default_278", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_277", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wv", + "name": "permute_476", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_476", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wv", + "name": "dtype_cast_325", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { 
+ "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_325", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention.wv", + "name": "alias_default_1676", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_892", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "alias_default_967", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_967", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_791", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wk", + "name": "einsum_default_279", + "op": "aten.einsum.default", + "phase": "backward", + "placement": 
"P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_793", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wk", + "name": "permute_479", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_967", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_479", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wk", + "name": "einsum_default_280", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "einsum_default_278", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_280", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "add_186", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_279", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wk", + "name": "permute_480", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_480", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wk", + "name": "dtype_cast_326", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_326", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention.wk", + "name": "alias_default_1675", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_893", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention", + "name": "alias_default_968", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_968", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_791", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.28.attention.wq", + "name": "einsum_default_281", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_792", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wq", + "name": "permute_483", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_968", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_483", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention.wq", + "name": "einsum_default_282", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + 
}, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_186", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_282", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28", + "name": "add_187", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": "permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_281", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wq", + "name": "permute_484", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_484", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention.wq", + "name": "dtype_cast_327", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 
4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_327", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention.wq", + "name": "alias_default_1674", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_187", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "convert_element_type_956", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_787", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.28.attention_norm", + "name": "convert_element_type_957", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_788", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "convert_element_type_958", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_956", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "alias_default_969", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_969", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_958", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "mul_306", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_957", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_790", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "mul_307", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_306", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "alias_default_970", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + 
"source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_307", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "alias_default_971", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_971", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_970", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "mul_308", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_308", + "src_placement": "S(0)S(1)", + "transition_cost": 
0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "sum_25", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_971", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "div_40", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_40", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "mul_309", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 
78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_970", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_309", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "sub_12", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + "cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_790", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "mul_310", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_969", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_971", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.28.attention_norm", + "name": "mul_311", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_311", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "sum_26", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_310", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "convert_element_type_959", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"P(sum)P(sum)", + "name": "sum_26", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "convert_element_type_960", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_965", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_959", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "add_188", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_960", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "dtype_cast_328", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_328", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.28.attention_norm", + "name": "alias_default_1681", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_188", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w2", + "name": "alias_default_972", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_972", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_785", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.27.feed_forward.w2", + "name": "einsum_default_283", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_786", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w2", + "name": "permute_487", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_972", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_487", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w2", + "name": "einsum_default_284", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_283", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w2", + "name": "permute_488", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + "cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_488", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w2", + "name": "dtype_cast_329", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_329", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w2", + "name": "alias_default_1670", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_284", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w2", + "name": "alias_default_973", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_973", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_782", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "mul_312", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_973", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_784", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "mul_313", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_312", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "alias_default_974", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_974", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_778", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w3", + "name": "einsum_default_285", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_783", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w3", + "name": "permute_491", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_974", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_491", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w3", + "name": "einsum_default_286", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_285", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.27.feed_forward.w3", + "name": "permute_492", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_492", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w3", + "name": "dtype_cast_330", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_330", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w3", + "name": "alias_default_1671", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_313", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "convert_element_type_969", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_780", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "convert_element_type_970", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_970", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "alias_default_975", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_975", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "neg_36", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_36", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "exp_36", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_36", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "add_189", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": 
{ + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_189", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "reciprocal_4", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_4", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "mul_314", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_314", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "alias_default_976", + "op": "aten.alias.default", 
+ "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_969", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_976", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "mul_315", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_976", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "sub_13", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_975", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "mul_316", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_316", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "add_190", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_315", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_190", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "mul_317", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": 
{ + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_317", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "convert_element_type_971", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_971", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward", + "name": "alias_default_977", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_977", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)R", + "name": "alias_default_778", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w1", + "name": "einsum_default_287", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_779", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w1", + "name": "permute_495", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_977", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_495", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w1", + "name": "einsum_default_288", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_286", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_288", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27", + "name": "add_191", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_287", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w1", + "name": "permute_496", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_496", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.27.feed_forward.w1", + "name": "dtype_cast_331", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_331", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.feed_forward.w1", + "name": "alias_default_1669", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_191", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "convert_element_type_976", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_774", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "convert_element_type_977", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_775", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "convert_element_type_978", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_976", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "alias_default_978", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 
2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_978", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_978", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "mul_318", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_977", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_777", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "mul_319", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_318", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.27.ffn_norm", + "name": "alias_default_979", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_319", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "alias_default_980", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_980", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_979", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "mul_320", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 
26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_320", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "sum_27", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_980", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "div_41", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_41", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "mul_321", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_979", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_321", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "sub_14", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_777", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "mul_322", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_978", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_980", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "mul_323", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_323", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "sum_28", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_322", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "convert_element_type_979", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_28", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "convert_element_type_980", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_972", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_979", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "add_192", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_980", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "dtype_cast_332", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + 
"shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_332", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.ffn_norm", + "name": "alias_default_1673", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_192", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wo", + "name": "alias_default_981", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_981", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", 
+ "name": "alias_default_772", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wo", + "name": "einsum_default_289", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_773", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wo", + "name": "permute_499", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_981", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_499", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wo", + "name": "einsum_default_290", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_289", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wo", + "name": "permute_500", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_500", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wo", + "name": "dtype_cast_333", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_333", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention.wo", + "name": "alias_default_1668", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_290", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_908", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_908", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "permute_501", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_501", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_768", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "alias_default_769", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_770", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_771", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_244", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_249", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_250", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_4", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_4", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.sdpa", + "name": "getitem_300", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_4", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.sdpa", + "name": "getitem_301", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_4", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.sdpa", + "name": "getitem_302", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_302", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "permute_502", + "op": "aten.permute.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_301", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "permute_503", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_300", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "permute_504", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_502", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } 
+ ], + "module_path": "L['self'].layers.27.attention", + "name": "view_909", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_909", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "sum_29", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "squeeze_8", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_503", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_910", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_910", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "sum_30", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_30", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "squeeze_9", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "convert_element_type_985", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_504", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "convert_element_type_986", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_985", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_911", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_911", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_as_complex_72", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_767", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "_conj_8", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_8", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "clone_102", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 
64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_72", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_102", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "mul_324", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_986", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_912", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(2)", + "name": "view_912", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_as_complex_73", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_767", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "_conj_9", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_9", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "clone_103", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + 
"compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_73", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_103", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "mul_325", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_324", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_as_real_72", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_72", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_913", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + 
"code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_913", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "convert_element_type_987", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_325", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_as_real_73", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_73", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_914", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_914", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "convert_element_type_988", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_915", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", 
+ "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_987", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_916", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_988", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "view_917", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_915", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "alias_default_982", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_982", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_763", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wv", + "name": "einsum_default_291", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_766", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wv", + "name": "permute_507", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_982", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_507", + "src_placement": "RS(0)", + 
"transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention.wv", + "name": "einsum_default_292", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_291", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wv", + "name": "permute_508", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_508", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wv", + "name": "dtype_cast_334", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + 
"inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_334", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention.wv", + "name": "alias_default_1667", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_916", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "alias_default_983", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_983", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_763", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wk", + "name": "einsum_default_293", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_765", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wk", + "name": "permute_511", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_983", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_511", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wk", + "name": "einsum_default_294", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_292", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_294", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "add_193", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_293", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wk", + "name": "permute_512", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_512", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wk", + "name": "dtype_cast_335", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 
247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_335", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention.wk", + "name": "alias_default_1666", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_917", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention", + "name": "alias_default_984", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_984", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_763", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wq", + "name": "einsum_default_295", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + 
"shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_764", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wq", + "name": "permute_515", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_984", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_515", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention.wq", + "name": "einsum_default_296", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "add_193", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_296", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27", + "name": "add_194", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": "permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_295", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wq", + "name": "permute_516", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_516", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention.wq", + "name": "dtype_cast_336", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_336", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention.wq", + "name": "alias_default_1665", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_194", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "convert_element_type_1001", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_759", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "convert_element_type_1002", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_760", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "convert_element_type_1003", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1001", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "alias_default_985", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_985", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1003", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "mul_326", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1002", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_762", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "mul_327", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_326", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "alias_default_986", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_327", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "alias_default_987", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_987", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_986", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "mul_328", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_328", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "sum_31", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_987", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "div_42", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_42", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "mul_329", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "alias_default_986", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_329", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "sub_15", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + "cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_762", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "mul_330", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_985", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_987", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "mul_331", + "op": "aten.mul.Tensor", 
+ "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_331", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "sum_32", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_330", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "convert_element_type_1004", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_32", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.27.attention_norm", + "name": "convert_element_type_1005", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_981", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1004", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "add_195", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1005", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "dtype_cast_337", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_337", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.27.attention_norm", + "name": "alias_default_1672", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_195", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "alias_default_988", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_988", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_757", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "einsum_default_297", + "op": "aten.einsum.default", + 
"phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_758", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "permute_519", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_988", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_519", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "einsum_default_298", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_297", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "permute_520", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + "cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_520", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "dtype_cast_338", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_338", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "alias_default_1661", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + 
"transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_298", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w2", + "name": "alias_default_989", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_989", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_754", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "mul_332", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_989", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_756", + "src_placement": "S(0)S(2)", + "transition_cost": 
0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "mul_333", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_332", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "alias_default_990", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_990", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_750", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w3", + "name": "einsum_default_299", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, 
+ { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_755", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w3", + "name": "permute_523", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_990", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_523", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w3", + "name": "einsum_default_300", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_299", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w3", + "name": "permute_524", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + 
"shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_524", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w3", + "name": "dtype_cast_339", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_339", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w3", + "name": "alias_default_1662", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_333", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.26.feed_forward", + "name": "convert_element_type_1014", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_752", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "convert_element_type_1015", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1015", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "alias_default_991", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + 
"compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_991", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "neg_37", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "exp_37", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "add_196", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_196", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "reciprocal_5", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_5", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "mul_334", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_334", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "alias_default_992", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1014", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_992", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "mul_335", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_992", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "sub_16", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_991", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "sub_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "mul_336", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_336", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "add_197", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_335", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_197", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "mul_337", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", 
+ "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_337", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "convert_element_type_1016", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1016", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward", + "name": "alias_default_993", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_993", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_750", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w1", + 
"name": "einsum_default_301", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_751", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w1", + "name": "permute_527", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_993", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_527", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w1", + "name": "einsum_default_302", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + 
"compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_300", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_302", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26", + "name": "add_198", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_301", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w1", + "name": "permute_528", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_528", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w1", + "name": "dtype_cast_340", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": 
{ + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_340", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.feed_forward.w1", + "name": "alias_default_1660", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_198", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "convert_element_type_1021", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_746", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.26.ffn_norm", + "name": "convert_element_type_1022", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_747", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "convert_element_type_1023", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1021", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "alias_default_994", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(1)", + "name": "alias_default_994", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1023", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "mul_338", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1022", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_749", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "mul_339", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_338", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "alias_default_995", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_339", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "alias_default_996", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_996", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_995", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "mul_340", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_340", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.26.ffn_norm", + "name": "sum_33", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_996", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "div_43", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_43", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_33", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "mul_341", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_995", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_341", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "sub_17", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_17", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_749", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "mul_342", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_994", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_996", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": 
"mul_343", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_343", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "sum_34", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_342", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "convert_element_type_1024", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_34", + "src_placement": "P(sum)P(sum)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "convert_element_type_1025", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_988", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1024", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "add_199", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1025", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "dtype_cast_341", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_341", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.ffn_norm", + "name": "alias_default_1664", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_199", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wo", + "name": "alias_default_997", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_997", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_744", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wo", + "name": "einsum_default_303", + "op": "aten.einsum.default", + "phase": "backward", 
+ "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_745", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wo", + "name": "permute_531", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_997", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_531", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wo", + "name": "einsum_default_304", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_303", + "src_placement": "P(sum)P(sum)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wo", + "name": "permute_532", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_532", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wo", + "name": "dtype_cast_342", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_342", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention.wo", + "name": "alias_default_1659", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + 
"inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_304", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_932", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_932", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "permute_533", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_533", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_740", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_741", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_742", + "src_placement": "S(0)S(1)", + "transition_cost": 
0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_743", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_235", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_240", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_241", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_5", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.sdpa", + "name": "getitem_303", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "_scaled_dot_product_flash_attention_backward_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.sdpa", + "name": "getitem_304", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_5", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.sdpa", + "name": "getitem_305", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_305", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "permute_534", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { 
+ "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_304", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "permute_535", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_303", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "permute_536", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_534", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_933", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_933", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "sum_35", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_35", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "squeeze_10", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_535", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_934", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": 
".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_934", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "sum_36", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_36", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "squeeze_11", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "convert_element_type_1030", + "op": "prims.convert_element_type.default", + 
"phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_536", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "convert_element_type_1031", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1030", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_935", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"view_935", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_as_complex_74", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_739", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "_conj_10", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_10", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "clone_110", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 
14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_74", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_110", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "mul_344", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1031", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_936", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_936", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_as_complex_75", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": 
"xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_739", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "_conj_11", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_11", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "clone_111", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_75", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_111", + "src_placement": "RR", + "transition_cost": 0 + } + 
], + "module_path": "L['self'].layers.26.attention", + "name": "mul_345", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_344", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_as_real_74", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_74", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_937", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 
9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_937", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "convert_element_type_1032", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_345", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_as_real_75", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_75", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_938", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_938", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "convert_element_type_1033", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_939", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1032", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_940", + "op": "aten.view.default", + "phase": 
"backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1033", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "view_941", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_939", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "alias_default_998", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_998", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)R", + "name": "alias_default_735", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wv", + "name": "einsum_default_305", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_738", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wv", + "name": "permute_539", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_998", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_539", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention.wv", + "name": "einsum_default_306", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_305", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wv", + "name": "permute_540", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_540", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wv", + "name": "dtype_cast_343", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_343", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention.wv", + "name": "alias_default_1658", + "op": "aten.alias.default", + "phase": "backward", + 
"placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_940", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "alias_default_999", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_999", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_735", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wk", + "name": "einsum_default_307", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "alias_default_737", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wk", + "name": "permute_543", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_999", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_543", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wk", + "name": "einsum_default_308", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_306", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_308", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "add_200", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_307", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wk", + "name": "permute_544", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_544", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wk", + "name": "dtype_cast_344", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_344", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention.wk", + 
"name": "alias_default_1657", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_941", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention", + "name": "alias_default_1000", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1000", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_735", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wq", + "name": "einsum_default_309", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_736", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wq", + "name": "permute_547", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1000", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_547", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention.wq", + "name": "einsum_default_310", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_200", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_310", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26", + "name": "add_201", + "op": 
"aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": "permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_309", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wq", + "name": "permute_548", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_548", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention.wq", + "name": "dtype_cast_345", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_345", + "src_placement": 
"P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.26.attention.wq", + "name": "alias_default_1656", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_201", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "convert_element_type_1046", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_731", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "convert_element_type_1047", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + 
"cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_732", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "convert_element_type_1048", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1046", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "alias_default_1001", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1001", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1048", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "mul_346", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + 
"source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1047", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_734", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "mul_347", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_346", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "alias_default_1002", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_347", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "alias_default_1003", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1003", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1002", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "mul_348", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_348", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "sum_37", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, 
+ "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1003", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "div_44", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_44", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_37", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "mul_349", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1002", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_349", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "sub_18", + "op": "aten.sub.Tensor", + "phase": 
"backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + "cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_734", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "mul_350", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1001", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1003", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "mul_351", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + 
"cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_351", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "sum_38", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_350", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "convert_element_type_1049", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_38", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "convert_element_type_1050", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_997", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1049", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "add_202", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1050", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.26.attention_norm", + "name": "dtype_cast_346", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_346", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.26.attention_norm", + "name": "alias_default_1663", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_202", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w2", + "name": "alias_default_1004", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1004", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_729", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w2", + "name": "einsum_default_311", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_730", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w2", + "name": "permute_551", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1004", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_551", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w2", + "name": "einsum_default_312", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_311", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w2", + "name": "permute_552", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, 
+ 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + "cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_552", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w2", + "name": "dtype_cast_347", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_347", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w2", + "name": "alias_default_1652", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_312", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.25.feed_forward.w2", + "name": "alias_default_1005", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1005", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_726", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "mul_352", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1005", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_728", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "mul_353", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_352", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "alias_default_1006", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1006", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_722", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w3", + "name": "einsum_default_313", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_727", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.25.feed_forward.w3", + "name": "permute_555", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1006", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_555", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w3", + "name": "einsum_default_314", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_313", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w3", + "name": "permute_556", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 
225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_556", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w3", + "name": "dtype_cast_348", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_348", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w3", + "name": "alias_default_1653", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_353", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "convert_element_type_1059", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) 
* self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_724", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "convert_element_type_1060", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1060", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "alias_default_1007", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1007", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "neg_38", + 
"op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_38", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "exp_38", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_38", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "add_203", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_203", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.25.feed_forward", + "name": "reciprocal_6", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_6", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "mul_354", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_354", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "alias_default_1008", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": 
"S(0)S(2)", + "name": "convert_element_type_1059", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1008", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "mul_355", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1008", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "sub_19", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1007", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "mul_356", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + 
"code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_356", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "add_204", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_355", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_204", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "mul_357", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_357", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "convert_element_type_1061", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1061", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward", + "name": "alias_default_1009", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1009", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_722", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w1", + "name": "einsum_default_315", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_723", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w1", + "name": "permute_559", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1009", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_559", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w1", + "name": "einsum_default_316", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_314", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 
0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_316", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25", + "name": "add_205", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_315", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w1", + "name": "permute_560", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_560", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w1", + "name": "dtype_cast_349", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_349", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.feed_forward.w1", + "name": "alias_default_1651", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_205", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "convert_element_type_1066", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_718", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "convert_element_type_1067", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": 
"return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_719", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "convert_element_type_1068", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1066", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "alias_default_1010", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1010", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1068", + "src_placement": "RR", + "transition_cost": 
0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "mul_358", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1067", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_721", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "mul_359", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_358", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "alias_default_1011", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": 
"alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_359", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "alias_default_1012", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1012", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1011", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "mul_360", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_360", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "sum_39", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1012", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "div_45", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_45", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_39", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "mul_361", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1011", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_361", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "sub_20", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_721", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "mul_362", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1010", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1012", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "mul_363", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_363", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "sum_40", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_362", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "convert_element_type_1069", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_40", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "convert_element_type_1070", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + 
"shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1004", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1069", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "add_206", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1070", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "dtype_cast_350", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + 
"name": "dtype_cast_350", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.ffn_norm", + "name": "alias_default_1655", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_206", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wo", + "name": "alias_default_1013", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1013", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_716", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wo", + "name": "einsum_default_317", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_717", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wo", + "name": "permute_563", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1013", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_563", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wo", + "name": "einsum_default_318", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_317", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wo", + "name": "permute_564", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 
4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_564", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wo", + "name": "dtype_cast_351", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_351", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention.wo", + "name": "alias_default_1650", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_318", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.25.attention", + "name": "view_956", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_956", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "permute_565", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_565", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_712", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_713", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_714", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_715", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "getitem_226", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_231", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_232", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_6", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.sdpa", + "name": "getitem_306", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.sdpa", + 
"name": "getitem_307", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.sdpa", + "name": "getitem_308", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_308", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "permute_566", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + 
"name": "getitem_307", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "permute_567", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_306", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "permute_568", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_566", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_957", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + 
"compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_957", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "sum_41", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_41", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "squeeze_12", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_567", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_958", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, 
+ { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_958", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "sum_42", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_42", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "squeeze_13", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "convert_element_type_1075", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_568", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "convert_element_type_1076", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1075", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_959", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_959", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_as_complex_76", + "op": 
"aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_711", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "_conj_12", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_12", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "clone_118", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_76", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_118", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "mul_364", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1076", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_960", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_960", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_as_complex_77", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + 
"line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_711", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "_conj_13", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_13", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "clone_119", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_77", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_119", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "mul_365", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 
32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_364", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_as_real_76", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_76", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_961", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_961", + "src_placement": "S(0)S(2)", + "transition_cost": 0 
+ } + ], + "module_path": "L['self'].layers.25.attention", + "name": "convert_element_type_1077", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_365", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_as_real_77", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_77", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_962", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + 
"cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_962", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "convert_element_type_1078", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_12", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_963", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1077", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_964", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1078", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "view_965", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_963", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "alias_default_1014", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1014", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_707", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.25.attention.wv", + "name": "einsum_default_319", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_710", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wv", + "name": "permute_571", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1014", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_571", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention.wv", + "name": "einsum_default_320", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + 
"cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_319", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wv", + "name": "permute_572", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_572", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wv", + "name": "dtype_cast_352", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_352", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention.wv", + "name": "alias_default_1649", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_964", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "alias_default_1015", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1015", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_707", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wk", + "name": "einsum_default_321", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_709", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.25.attention.wk", + "name": "permute_575", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1015", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_575", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wk", + "name": "einsum_default_322", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_320", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_322", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "add_207", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_321", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wk", + "name": "permute_576", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_576", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wk", + "name": "dtype_cast_353", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_353", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention.wk", + "name": "alias_default_1648", + "op": "aten.alias.default", + "phase": "backward", + 
"placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_965", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention", + "name": "alias_default_1016", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1016", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_707", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wq", + "name": "einsum_default_323", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "RS(1)", + "name": "alias_default_708", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wq", + "name": "permute_579", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1016", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_579", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention.wq", + "name": "einsum_default_324", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_207", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_324", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25", + "name": "add_208", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 
8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": "permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_323", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wq", + "name": "permute_580", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_580", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention.wq", + "name": "dtype_cast_354", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_354", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": 
"L['self'].layers.25.attention.wq", + "name": "alias_default_1647", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_208", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "convert_element_type_1091", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_703", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "convert_element_type_1092", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + 
"dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_704", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "convert_element_type_1093", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1091", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "alias_default_1017", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1017", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1093", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "mul_366", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1092", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_706", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "mul_367", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_366", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "alias_default_1018", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_367", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + 
"name": "alias_default_1019", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1019", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1018", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "mul_368", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_368", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "sum_43", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", 
+ "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1019", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "div_46", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_46", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_43", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "mul_369", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1018", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_369", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "sub_21", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + 
"source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + "cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_706", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "mul_370", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1017", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1019", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "mul_371", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_371", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "sum_44", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_370", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "convert_element_type_1094", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_44", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "convert_element_type_1095", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1013", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1094", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "add_209", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1095", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "dtype_cast_355", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_355", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.25.attention_norm", + "name": "alias_default_1654", + "op": 
"aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_209", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "alias_default_1020", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1020", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_701", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "einsum_default_325", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_702", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "permute_583", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1020", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_583", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "einsum_default_326", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_325", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "permute_584", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", 
+ "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + "cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_584", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "dtype_cast_356", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_356", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "alias_default_1643", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_326", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w2", + "name": "alias_default_1021", + "op": "aten.alias.default", 
+ "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1021", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_698", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "mul_372", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1021", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_700", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "mul_373", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_372", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "alias_default_1022", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1022", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_694", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w3", + "name": "einsum_default_327", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_699", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w3", + "name": "permute_587", + "op": "aten.permute.default", + "phase": "backward", + 
"placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1022", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_587", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w3", + "name": "einsum_default_328", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_327", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w3", + "name": "permute_588", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_588", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w3", + "name": "dtype_cast_357", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_357", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w3", + "name": "alias_default_1644", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_373", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "convert_element_type_1104", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_696", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "convert_element_type_1105", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1105", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "alias_default_1023", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1023", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "neg_39", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_39", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "exp_39", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_39", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "add_210", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_210", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "reciprocal_7", + "op": "aten.reciprocal.default", + 
"phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_7", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "mul_374", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_374", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "alias_default_1024", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1104", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1024", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "mul_375", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1024", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "sub_22", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1023", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "mul_376", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_376", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "add_211", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_375", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_211", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "mul_377", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_377", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.24.feed_forward", + "name": "convert_element_type_1106", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1106", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward", + "name": "alias_default_1025", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1025", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_694", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w1", + "name": "einsum_default_329", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_695", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w1", + "name": "permute_591", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1025", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_591", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w1", + "name": "einsum_default_330", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_328", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_330", + "src_placement": "S(0)P(sum)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24", + "name": "add_212", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_329", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w1", + "name": "permute_592", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_592", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w1", + "name": "dtype_cast_358", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": 
[ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_358", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.feed_forward.w1", + "name": "alias_default_1642", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_212", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "convert_element_type_1111", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_690", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "convert_element_type_1112", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_691", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "convert_element_type_1113", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1111", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "alias_default_1026", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1026", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1113", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": 
"mul_378", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1112", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_693", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "mul_379", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_378", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "alias_default_1027", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_379", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "alias_default_1028", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1028", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1027", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "mul_380", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_380", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "sum_45", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": 
"rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1028", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "div_47", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_47", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_45", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "mul_381", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1027", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_381", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.24.ffn_norm", + "name": "sub_23", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_23", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_693", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "mul_382", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1026", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1028", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "mul_383", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 
2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_383", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "sum_46", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_382", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "convert_element_type_1114", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_46", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "convert_element_type_1115", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1020", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1114", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "add_213", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1115", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "dtype_cast_359", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_359", + "src_placement": "P(sum)P(sum)", + 
"transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.ffn_norm", + "name": "alias_default_1646", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_213", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wo", + "name": "alias_default_1029", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1029", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_688", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wo", + "name": "einsum_default_331", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + 
"cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_689", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wo", + "name": "permute_595", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1029", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_595", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wo", + "name": "einsum_default_332", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_331", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wo", + "name": "permute_596", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_596", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wo", + "name": "dtype_cast_360", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_360", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention.wo", + "name": "alias_default_1641", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_332", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_980", + "op": "aten.view.default", + 
"phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_980", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "permute_597", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_597", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_684", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_685", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_686", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_687", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_217", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_222", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_223", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_7", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_7", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.sdpa", + "name": "getitem_309", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_7", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.sdpa", + "name": "getitem_310", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + 
"shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_7", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.sdpa", + "name": "getitem_311", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_311", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "permute_598", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_310", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.24.attention", + "name": "permute_599", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_309", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "permute_600", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_598", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_981", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)S(2)", + "name": "view_981", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "sum_47", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_47", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "squeeze_14", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_599", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_982", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, 
+ "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_982", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "sum_48", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_48", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "squeeze_15", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_15", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "convert_element_type_1120", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_600", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "convert_element_type_1121", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1120", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_983", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_983", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_as_complex_78", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + 
"code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_683", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "_conj_14", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_14", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "clone_126", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_78", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_126", + "src_placement": "RR", + "transition_cost": 
0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "mul_384", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1121", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_984", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_984", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_as_complex_79", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": 
"complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_683", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "_conj_15", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_15", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "clone_127", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_79", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_127", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "mul_385", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_384", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_as_real_78", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_78", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_985", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_985", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "convert_element_type_1122", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_385", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_as_real_79", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_79", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_986", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_986", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "convert_element_type_1123", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_987", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1122", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_988", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + 
}, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1123", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "view_989", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_987", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "alias_default_1030", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1030", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_679", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wv", + "name": "einsum_default_333", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 
4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_682", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wv", + "name": "permute_603", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1030", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_603", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention.wv", + "name": "einsum_default_334", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + 
"name": "einsum_default_333", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wv", + "name": "permute_604", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_604", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wv", + "name": "dtype_cast_361", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_361", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention.wv", + "name": "alias_default_1640", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + 
"cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_988", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "alias_default_1031", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1031", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_679", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wk", + "name": "einsum_default_335", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_681", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wk", + "name": "permute_607", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": 
"xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1031", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_607", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wk", + "name": "einsum_default_336", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_334", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_336", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "add_214", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_335", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wk", + "name": "permute_608", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_608", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wk", + "name": "dtype_cast_362", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_362", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention.wk", + "name": "alias_default_1639", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + 
"func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_989", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention", + "name": "alias_default_1032", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1032", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_679", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wq", + "name": "einsum_default_337", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_680", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wq", + "name": "permute_611", + "op": "aten.permute.default", + 
"phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1032", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_611", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention.wq", + "name": "einsum_default_338", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_214", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_338", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24", + "name": "add_215", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": "permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_337", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wq", + "name": "permute_612", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_612", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention.wq", + "name": "dtype_cast_363", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_363", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention.wq", + "name": "alias_default_1638", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_215", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "convert_element_type_1136", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_675", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "convert_element_type_1137", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_676", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.24.attention_norm", + "name": "convert_element_type_1138", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1136", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "alias_default_1033", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1033", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1138", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "mul_386", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": 
"mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1137", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_678", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "mul_387", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_386", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "alias_default_1034", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_387", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "alias_default_1035", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1035", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1034", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "mul_388", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_388", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "sum_49", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1035", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.24.attention_norm", + "name": "div_48", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_48", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_49", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "mul_389", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1034", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_389", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "sub_24", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, 
+ "transition_cost": 0.0 + }, + { + "cluster_id": 219, + "cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_678", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "mul_390", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1033", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1035", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "mul_391", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_391", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.24.attention_norm", + "name": "sum_50", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_390", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "convert_element_type_1139", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_50", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "convert_element_type_1140", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, 
+ "dst_placement": "S(0)S(1)", + "name": "alias_default_1029", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1139", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "add_216", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1140", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "dtype_cast_364", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_364", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.24.attention_norm", + "name": "alias_default_1645", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_216", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w2", + "name": "alias_default_1036", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1036", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_673", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w2", + "name": "einsum_default_339", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_674", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.23.feed_forward.w2", + "name": "permute_615", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1036", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_615", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w2", + "name": "einsum_default_340", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_339", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w2", + "name": "permute_616", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_616", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w2", + "name": "dtype_cast_365", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_365", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w2", + "name": "alias_default_1634", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_340", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w2", + "name": "alias_default_1037", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1037", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_670", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "mul_392", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1037", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_672", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "mul_393", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_392", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "alias_default_1038", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1038", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_666", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w3", + "name": "einsum_default_341", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_671", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w3", + "name": "permute_619", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1038", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_619", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w3", + "name": "einsum_default_342", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_341", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w3", + "name": "permute_620", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_620", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.23.feed_forward.w3", + "name": "dtype_cast_366", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_366", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w3", + "name": "alias_default_1635", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_393", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "convert_element_type_1149", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_668", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "convert_element_type_1150", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1150", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "alias_default_1039", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1039", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "neg_40", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_40", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "exp_40", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_40", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "add_217", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_217", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "reciprocal_8", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_8", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "mul_394", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_394", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "alias_default_1040", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1149", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1040", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "mul_395", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1040", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "sub_25", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1039", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "mul_396", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_396", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "add_218", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_395", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_218", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "mul_397", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_397", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "convert_element_type_1151", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1151", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward", + "name": "alias_default_1041", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1041", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_666", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w1", + "name": "einsum_default_343", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_667", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w1", + "name": "permute_623", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1041", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_623", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w1", + "name": "einsum_default_344", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_342", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_344", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23", + "name": "add_219", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_343", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w1", + "name": "permute_624", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_624", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.feed_forward.w1", + "name": "dtype_cast_367", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_367", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.23.feed_forward.w1", + "name": "alias_default_1633", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_219", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "convert_element_type_1156", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_662", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "convert_element_type_1157", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_663", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "convert_element_type_1158", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1156", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "alias_default_1042", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1042", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1158", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "mul_398", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1157", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_665", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "mul_399", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_398", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "alias_default_1043", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_399", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": 
"alias_default_1044", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1044", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1043", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "mul_400", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_400", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "sum_51", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1044", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "div_49", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_49", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_51", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "mul_401", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1043", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_401", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "sub_26", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_665", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "mul_402", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1042", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1044", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "mul_403", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_403", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "sum_52", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_402", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "convert_element_type_1159", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_52", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "convert_element_type_1160", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1036", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1159", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "add_220", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1160", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "dtype_cast_368", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_368", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.ffn_norm", + "name": "alias_default_1637", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_220", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wo", + "name": "alias_default_1045", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1045", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_660", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wo", + "name": "einsum_default_345", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_661", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wo", + "name": "permute_627", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1045", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_627", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wo", + "name": "einsum_default_346", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_345", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wo", + "name": "permute_628", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_628", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wo", + "name": "dtype_cast_369", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_369", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention.wo", + "name": "alias_default_1632", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_346", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_1004", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1004", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "permute_629", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_629", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_656", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_657", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_658", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_659", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_208", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_213", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_214", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_8", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_8", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.sdpa", + "name": "getitem_312", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_8", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.sdpa", + "name": "getitem_313", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_8", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.sdpa", + "name": "getitem_314", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_314", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "permute_630", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_313", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "permute_631", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_312", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "permute_632", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_630", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_1005", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1005", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.23.attention", + "name": "sum_53", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "squeeze_16", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_631", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_1006", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1006", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "sum_54", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "squeeze_17", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_17", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "convert_element_type_1165", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_632", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "convert_element_type_1166", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1165", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_1007", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1007", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_as_complex_80", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_655", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "_conj_16", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_16", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "clone_134", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_80", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_134", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": 
"mul_404", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1166", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_1008", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1008", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_as_complex_81", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_655", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "_conj_17", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_17", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "clone_135", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_81", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_135", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "mul_405", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_404", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_as_real_80", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_80", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_1009", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1009", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "convert_element_type_1167", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_405", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_as_real_81", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_81", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_1010", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1010", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "convert_element_type_1168", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_1011", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1167", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_1012", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1168", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "view_1013", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1011", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "alias_default_1046", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1046", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_651", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wv", + "name": "einsum_default_347", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_654", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wv", + "name": "permute_635", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1046", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_635", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention.wv", + "name": "einsum_default_348", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_347", + "src_placement": "P(sum)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wv", + "name": "permute_636", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_636", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wv", + "name": "dtype_cast_370", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_370", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention.wv", + "name": "alias_default_1631", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1012", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "alias_default_1047", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1047", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_651", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wk", + "name": "einsum_default_349", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_653", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wk", + "name": "permute_639", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1047", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_639", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wk", + "name": "einsum_default_350", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_348", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_350", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "add_221", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_349", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wk", + "name": "permute_640", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_640", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wk", + "name": "dtype_cast_371", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_371", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention.wk", + "name": "alias_default_1630", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1013", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention", + "name": "alias_default_1048", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1048", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_651", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wq", + "name": "einsum_default_351", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_652", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wq", + "name": "permute_643", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1048", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_643", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention.wq", + "name": "einsum_default_352", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_221", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_352", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23", + "name": "add_222", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_351", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wq", + "name": "permute_644", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_644", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention.wq", + "name": "dtype_cast_372", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_372", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention.wq", + "name": "alias_default_1629", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_222", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "convert_element_type_1181", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_647", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "convert_element_type_1182", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_648", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "convert_element_type_1183", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1181", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "alias_default_1049", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1049", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1183", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "mul_406", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1182", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_650", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "mul_407", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_406", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "alias_default_1050", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_407", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "alias_default_1051", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1051", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1050", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "mul_408", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_408", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "sum_55", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1051", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "div_50", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_50", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_55", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "mul_409", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1050", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_409", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "sub_27", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_650", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "mul_410", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1049", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1051", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "mul_411", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_411", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "sum_56", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_410", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "convert_element_type_1184", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_56", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "convert_element_type_1185", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1045", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1184", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "add_223", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1185", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "dtype_cast_373", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_373", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.23.attention_norm", + "name": "alias_default_1636", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_223", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w2", + "name": "alias_default_1052", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1052", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_645", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w2", + "name": "einsum_default_353", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_646", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.22.feed_forward.w2", + "name": "permute_647", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1052", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_647", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w2", + "name": "einsum_default_354", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_353", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w2", + "name": "permute_648", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_648", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w2", + "name": "dtype_cast_374", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_374", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w2", + "name": "alias_default_1625", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_354", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w2", + "name": "alias_default_1053", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1053", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_642", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "mul_412", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1053", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_644", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "mul_413", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_412", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "alias_default_1054", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1054", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_638", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w3", + "name": "einsum_default_355", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_643", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w3", + "name": "permute_651", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1054", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_651", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w3", + "name": "einsum_default_356", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_355", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w3", + "name": "permute_652", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_652", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.22.feed_forward.w3", + "name": "dtype_cast_375", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_375", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w3", + "name": "alias_default_1626", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_413", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "convert_element_type_1194", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_640", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "convert_element_type_1195", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1195", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "alias_default_1055", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1055", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "neg_41", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_41", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "exp_41", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_41", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "add_224", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_224", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "reciprocal_9", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_9", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "mul_414", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_414", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "alias_default_1056", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1194", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1056", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "mul_415", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1056", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "sub_28", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1055", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "mul_416", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_416", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "add_225", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_415", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_225", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "mul_417", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_417", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "convert_element_type_1196", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1196", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward", + "name": "alias_default_1057", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1057", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_638", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w1", + "name": "einsum_default_357", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_639", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w1", + "name": "permute_655", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1057", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_655", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w1", + "name": "einsum_default_358", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_356", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_358", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22", + "name": "add_226", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_357", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w1", + "name": "permute_656", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_656", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.feed_forward.w1", + "name": "dtype_cast_376", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_376", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.22.feed_forward.w1", + "name": "alias_default_1624", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_226", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "convert_element_type_1201", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_634", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "convert_element_type_1202", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_635", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "convert_element_type_1203", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1201", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "alias_default_1058", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1058", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1203", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "mul_418", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1202", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_637", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "mul_419", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_418", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "alias_default_1059", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_419", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": 
"alias_default_1060", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1060", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1059", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "mul_420", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_420", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "sum_57", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1060", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "div_51", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_51", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_57", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "mul_421", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1059", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_421", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "sub_29", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_29", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_637", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "mul_422", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1058", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1060", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "mul_423", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_423", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "sum_58", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_422", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "convert_element_type_1204", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_58", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "convert_element_type_1205", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1052", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1204", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "add_227", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1205", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "dtype_cast_377", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_377", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.ffn_norm", + "name": "alias_default_1628", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_227", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wo", + "name": "alias_default_1061", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1061", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_632", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wo", + "name": "einsum_default_359", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_633", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wo", + "name": "permute_659", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1061", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_659", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wo", + "name": "einsum_default_360", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_359", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wo", + "name": "permute_660", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_660", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wo", + "name": "dtype_cast_378", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_378", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention.wo", + "name": "alias_default_1623", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_360", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_1028", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1028", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "permute_661", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_661", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_628", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_629", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_630", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_631", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_199", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_204", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_205", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_9", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.sdpa", + "name": "getitem_315", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.sdpa", + "name": "getitem_316", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_9", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.sdpa", + "name": "getitem_317", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_317", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "permute_662", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_316", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "permute_663", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_315", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "permute_664", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_662", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_1029", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1029", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.22.attention", + "name": "sum_59", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_59", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "squeeze_18", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_663", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_1030", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1030", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "sum_60", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_60", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "squeeze_19", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "convert_element_type_1210", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_664", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "convert_element_type_1211", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1210", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_1031", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1031", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_as_complex_82", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_627", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "_conj_18", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_18", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "clone_142", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_82", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_142", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": 
"mul_424", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1211", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_1032", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1032", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_as_complex_83", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_627", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "_conj_19", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_19", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "clone_143", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_83", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_143", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "mul_425", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_424", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_as_real_82", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_82", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_1033", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1033", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "convert_element_type_1212", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_425", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_as_real_83", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_83", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_1034", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1034", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "convert_element_type_1213", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_1035", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1212", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_1036", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1213", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "view_1037", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1035", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "alias_default_1062", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1062", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_623", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wv", + "name": "einsum_default_361", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_626", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wv", + "name": "permute_667", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1062", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_667", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention.wv", + "name": "einsum_default_362", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_361", + "src_placement": "P(sum)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wv", + "name": "permute_668", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_668", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wv", + "name": "dtype_cast_379", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_379", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention.wv", + "name": "alias_default_1622", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1036", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "alias_default_1063", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1063", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_623", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wk", + "name": "einsum_default_363", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_625", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wk", + "name": "permute_671", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1063", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_671", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wk", + "name": "einsum_default_364", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_362", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_364", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "add_228", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_363", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wk", + "name": "permute_672", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_672", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wk", + "name": "dtype_cast_380", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_380", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention.wk", + "name": "alias_default_1621", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1037", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention", + "name": "alias_default_1064", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1064", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_623", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wq", + "name": "einsum_default_365", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_624", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wq", + "name": "permute_675", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1064", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_675", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention.wq", + "name": "einsum_default_366", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_228", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_366", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22", + "name": "add_229", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_365", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wq", + "name": "permute_676", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_676", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention.wq", + "name": "dtype_cast_381", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_381", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention.wq", + "name": "alias_default_1620", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_229", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "convert_element_type_1226", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_619", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "convert_element_type_1227", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_620", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "convert_element_type_1228", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1226", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "alias_default_1065", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1065", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1228", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "mul_426", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1227", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_622", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "mul_427", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_426", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "alias_default_1066", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_427", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "alias_default_1067", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1067", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1066", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "mul_428", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_428", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "sum_61", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1067", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "div_52", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_52", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_61", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "mul_429", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1066", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_429", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "sub_30", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_30", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_622", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "mul_430", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1065", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1067", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "mul_431", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_431", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "sum_62", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_430", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "convert_element_type_1229", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_62", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "convert_element_type_1230", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1061", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1229", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "add_230", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1230", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "dtype_cast_382", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_382", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.22.attention_norm", + "name": "alias_default_1627", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_230", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w2", + "name": "alias_default_1068", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1068", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_617", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w2", + "name": "einsum_default_367", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_618", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.21.feed_forward.w2", + "name": "permute_679", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1068", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_679", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w2", + "name": "einsum_default_368", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_367", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w2", + "name": "permute_680", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_680", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w2", + "name": "dtype_cast_383", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_383", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w2", + "name": "alias_default_1616", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_368", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w2", + "name": "alias_default_1069", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1069", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_614", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "mul_432", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1069", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_616", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "mul_433", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_432", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "alias_default_1070", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1070", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_610", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w3", + "name": "einsum_default_369", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_615", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w3", + "name": "permute_683", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1070", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_683", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w3", + "name": "einsum_default_370", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_369", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w3", + "name": "permute_684", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_684", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.21.feed_forward.w3", + "name": "dtype_cast_384", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_384", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w3", + "name": "alias_default_1617", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_433", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "convert_element_type_1239", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_612", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "convert_element_type_1240", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1240", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "alias_default_1071", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1071", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "neg_42", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_42", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "exp_42", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_42", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "add_231", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_231", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "reciprocal_10", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_10", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "mul_434", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_434", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "alias_default_1072", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1239", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1072", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "mul_435", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1072", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "sub_31", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1071", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "mul_436", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_436", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "add_232", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_435", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_232", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "mul_437", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_437", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "convert_element_type_1241", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1241", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward", + "name": "alias_default_1073", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1073", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_610", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w1", + "name": "einsum_default_371", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_611", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w1", + "name": "permute_687", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1073", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_687", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w1", + "name": "einsum_default_372", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_370", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_372", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21", + "name": "add_233", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_371", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w1", + "name": "permute_688", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_688", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.feed_forward.w1", + "name": "dtype_cast_385", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_385", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.21.feed_forward.w1", + "name": "alias_default_1615", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_233", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "convert_element_type_1246", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_606", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "convert_element_type_1247", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_607", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "convert_element_type_1248", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1246", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "alias_default_1074", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1074", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1248", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "mul_438", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1247", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_609", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "mul_439", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_438", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "alias_default_1075", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_439", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": 
"alias_default_1076", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1076", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1075", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "mul_440", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_440", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "sum_63", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1076", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "div_53", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_53", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_63", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "mul_441", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1075", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_441", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "sub_32", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_32", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_609", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "mul_442", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1074", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1076", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "mul_443", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_443", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "sum_64", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_442", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "convert_element_type_1249", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_64", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "convert_element_type_1250", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1068", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1249", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "add_234", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1250", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "dtype_cast_386", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_386", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.ffn_norm", + "name": "alias_default_1619", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_234", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wo", + "name": "alias_default_1077", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1077", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_604", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wo", + "name": "einsum_default_373", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_605", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wo", + "name": "permute_691", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1077", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_691", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wo", + "name": "einsum_default_374", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_373", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wo", + "name": "permute_692", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_692", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wo", + "name": "dtype_cast_387", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_387", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention.wo", + "name": "alias_default_1614", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_374", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_1052", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1052", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "permute_693", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_693", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_600", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_601", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_602", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_603", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_190", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_195", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_196", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_10", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_10", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.sdpa", + "name": "getitem_318", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_10", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.sdpa", + "name": "getitem_319", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_10", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.sdpa", + "name": "getitem_320", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_320", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "permute_694", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_319", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "permute_695", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_318", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "permute_696", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_694", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_1053", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1053", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.21.attention", + "name": "sum_65", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_65", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "squeeze_20", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_695", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_1054", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1054", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "sum_66", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_66", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "squeeze_21", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_21", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "convert_element_type_1255", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_696", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "convert_element_type_1256", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1255", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_1055", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1055", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_as_complex_84", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_599", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "_conj_20", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_20", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "clone_150", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_84", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_150", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": 
"mul_444", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1256", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_1056", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1056", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_as_complex_85", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_599", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "_conj_21", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_21", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "clone_151", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_85", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_151", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "mul_445", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_444", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_as_real_84", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_84", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_1057", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1057", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "convert_element_type_1257", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_445", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_as_real_85", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_85", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_1058", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1058", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "convert_element_type_1258", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_20", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_1059", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1257", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_1060", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1258", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "view_1061", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1059", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "alias_default_1078", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1078", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_595", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wv", + "name": "einsum_default_375", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_598", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wv", + "name": "permute_699", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1078", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_699", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention.wv", + "name": "einsum_default_376", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_375", + "src_placement": "P(sum)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wv", + "name": "permute_700", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_700", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wv", + "name": "dtype_cast_388", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_388", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention.wv", + "name": "alias_default_1613", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1060", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "alias_default_1079", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1079", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_595", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wk", + "name": "einsum_default_377", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_597", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wk", + "name": "permute_703", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1079", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_703", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wk", + "name": "einsum_default_378", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_376", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_378", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "add_235", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_377", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wk", + "name": "permute_704", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_704", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wk", + "name": "dtype_cast_389", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_389", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention.wk", + "name": "alias_default_1612", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1061", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention", + "name": "alias_default_1080", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1080", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_595", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wq", + "name": "einsum_default_379", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_596", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wq", + "name": "permute_707", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1080", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_707", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention.wq", + "name": "einsum_default_380", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_235", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_380", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21", + "name": "add_236", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_379", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wq", + "name": "permute_708", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_708", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention.wq", + "name": "dtype_cast_390", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_390", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention.wq", + "name": "alias_default_1611", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_236", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "convert_element_type_1271", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_591", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "convert_element_type_1272", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_592", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "convert_element_type_1273", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1271", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "alias_default_1081", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1081", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1273", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "mul_446", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1272", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_594", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "mul_447", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_446", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "alias_default_1082", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_447", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "alias_default_1083", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1083", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1082", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "mul_448", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_448", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "sum_67", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1083", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "div_54", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_54", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_67", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "mul_449", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1082", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_449", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "sub_33", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_33", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_594", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "mul_450", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1081", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1083", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "mul_451", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_451", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "sum_68", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_450", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "convert_element_type_1274", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_68", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "convert_element_type_1275", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1077", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1274", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "add_237", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1275", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "dtype_cast_391", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_391", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.21.attention_norm", + "name": "alias_default_1618", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_237", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w2", + "name": "alias_default_1084", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1084", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_589", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w2", + "name": "einsum_default_381", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_590", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.20.feed_forward.w2", + "name": "permute_711", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1084", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_711", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w2", + "name": "einsum_default_382", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_381", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w2", + "name": "permute_712", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_712", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w2", + "name": "dtype_cast_392", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_392", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w2", + "name": "alias_default_1607", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_382", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w2", + "name": "alias_default_1085", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1085", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_586", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "mul_452", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1085", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_588", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "mul_453", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_452", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "alias_default_1086", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1086", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_582", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w3", + "name": "einsum_default_383", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_587", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w3", + "name": "permute_715", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1086", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_715", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w3", + "name": "einsum_default_384", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_383", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w3", + "name": "permute_716", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_716", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.20.feed_forward.w3", + "name": "dtype_cast_393", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_393", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w3", + "name": "alias_default_1608", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_453", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "convert_element_type_1284", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_584", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "convert_element_type_1285", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1285", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "alias_default_1087", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1087", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "neg_43", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "exp_43", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "add_238", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_238", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "reciprocal_11", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_11", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "mul_454", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_454", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "alias_default_1088", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1284", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1088", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "mul_455", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1088", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "sub_34", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1087", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "mul_456", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_456", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "add_239", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_455", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_239", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "mul_457", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_457", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "convert_element_type_1286", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1286", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward", + "name": "alias_default_1089", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1089", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_582", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w1", + "name": "einsum_default_385", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_583", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w1", + "name": "permute_719", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1089", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_719", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w1", + "name": "einsum_default_386", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_384", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_386", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20", + "name": "add_240", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_385", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w1", + "name": "permute_720", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_720", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.feed_forward.w1", + "name": "dtype_cast_394", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_394", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.20.feed_forward.w1", + "name": "alias_default_1606", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_240", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "convert_element_type_1291", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_578", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "convert_element_type_1292", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_579", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "convert_element_type_1293", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1291", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "alias_default_1090", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1090", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1293", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "mul_458", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1292", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_581", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "mul_459", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_458", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "alias_default_1091", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_459", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": 
"alias_default_1092", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1092", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1091", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "mul_460", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_460", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "sum_69", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1092", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "div_55", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_55", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_69", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "mul_461", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1091", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_461", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "sub_35", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_35", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_581", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "mul_462", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1090", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1092", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "mul_463", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_463", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "sum_70", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_462", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "convert_element_type_1294", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_70", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "convert_element_type_1295", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1084", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1294", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "add_241", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1295", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "dtype_cast_395", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_395", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.ffn_norm", + "name": "alias_default_1610", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_241", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wo", + "name": "alias_default_1093", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1093", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_576", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wo", + "name": "einsum_default_387", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_577", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wo", + "name": "permute_723", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1093", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_723", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wo", + "name": "einsum_default_388", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_387", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wo", + "name": "permute_724", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_724", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wo", + "name": "dtype_cast_396", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_396", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention.wo", + "name": "alias_default_1605", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_388", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_1076", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1076", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "permute_725", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_725", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_572", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_573", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_574", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_575", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_181", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_186", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_187", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_11", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_11", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.sdpa", + "name": "getitem_321", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_11", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.sdpa", + "name": "getitem_322", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_11", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.sdpa", + "name": "getitem_323", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_323", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "permute_726", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_322", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "permute_727", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_321", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "permute_728", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_726", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_1077", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1077", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.20.attention", + "name": "sum_71", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_71", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "squeeze_22", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_727", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_1078", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1078", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "sum_72", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_72", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "squeeze_23", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "convert_element_type_1300", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_728", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "convert_element_type_1301", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1300", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_1079", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1079", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_as_complex_86", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_571", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "_conj_22", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_22", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "clone_158", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_86", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_158", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": 
"mul_464", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1301", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_1080", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1080", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_as_complex_87", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_571", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "_conj_23", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_23", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "clone_159", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_87", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_159", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "mul_465", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_464", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_as_real_86", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_86", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_1081", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1081", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "convert_element_type_1302", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_465", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_as_real_87", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_87", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_1082", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1082", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "convert_element_type_1303", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_1083", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1302", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_1084", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1303", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "view_1085", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1083", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "alias_default_1094", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1094", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_567", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wv", + "name": "einsum_default_389", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_570", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wv", + "name": "permute_731", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1094", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_731", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention.wv", + "name": "einsum_default_390", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_389", + "src_placement": "P(sum)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wv", + "name": "permute_732", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_732", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wv", + "name": "dtype_cast_397", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_397", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention.wv", + "name": "alias_default_1604", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1084", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "alias_default_1095", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1095", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_567", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wk", + "name": "einsum_default_391", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_569", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wk", + "name": "permute_735", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1095", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_735", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wk", + "name": "einsum_default_392", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_390", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_392", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "add_242", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_391", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wk", + "name": "permute_736", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_736", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wk", + "name": "dtype_cast_398", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_398", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention.wk", + "name": "alias_default_1603", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1085", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention", + "name": "alias_default_1096", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1096", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_567", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wq", + "name": "einsum_default_393", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_568", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wq", + "name": "permute_739", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1096", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_739", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention.wq", + "name": "einsum_default_394", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_242", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_394", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20", + "name": "add_243", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_393", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wq", + "name": "permute_740", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_740", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention.wq", + "name": "dtype_cast_399", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_399", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention.wq", + "name": "alias_default_1602", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_243", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "convert_element_type_1316", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_563", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "convert_element_type_1317", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_564", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "convert_element_type_1318", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1316", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "alias_default_1097", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1097", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1318", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "mul_466", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1317", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_566", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "mul_467", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_466", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "alias_default_1098", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_467", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "alias_default_1099", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1099", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1098", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "mul_468", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_468", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "sum_73", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1099", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "div_56", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_56", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_73", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "mul_469", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1098", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_469", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "sub_36", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_36", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_566", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "mul_470", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1097", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1099", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "mul_471", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_471", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "sum_74", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_470", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "convert_element_type_1319", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_74", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "convert_element_type_1320", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1093", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1319", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "add_244", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1320", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "dtype_cast_400", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_400", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.20.attention_norm", + "name": "alias_default_1609", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_244", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w2", + "name": "alias_default_1100", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1100", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_561", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w2", + "name": "einsum_default_395", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_562", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.19.feed_forward.w2", + "name": "permute_743", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1100", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_743", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w2", + "name": "einsum_default_396", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_395", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w2", + "name": "permute_744", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_744", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w2", + "name": "dtype_cast_401", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_401", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w2", + "name": "alias_default_1598", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_396", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w2", + "name": "alias_default_1101", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1101", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_558", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "mul_472", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1101", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_560", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "mul_473", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_472", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "alias_default_1102", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1102", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_554", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w3", + "name": "einsum_default_397", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_559", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w3", + "name": "permute_747", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1102", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_747", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w3", + "name": "einsum_default_398", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_397", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w3", + "name": "permute_748", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_748", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.19.feed_forward.w3", + "name": "dtype_cast_402", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_402", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w3", + "name": "alias_default_1599", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_473", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "convert_element_type_1329", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_556", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "convert_element_type_1330", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1330", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "alias_default_1103", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1103", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "neg_44", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_44", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "exp_44", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_44", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "add_245", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_245", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "reciprocal_12", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_12", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "mul_474", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_474", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "alias_default_1104", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1329", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1104", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "mul_475", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1104", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "sub_37", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1103", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "mul_476", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_476", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "add_246", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_475", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_246", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "mul_477", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_477", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "convert_element_type_1331", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1331", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward", + "name": "alias_default_1105", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1105", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_554", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w1", + "name": "einsum_default_399", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_555", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w1", + "name": "permute_751", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1105", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_751", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w1", + "name": "einsum_default_400", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_398", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_400", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19", + "name": "add_247", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_399", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w1", + "name": "permute_752", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_752", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.feed_forward.w1", + "name": "dtype_cast_403", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_403", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.19.feed_forward.w1", + "name": "alias_default_1597", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_247", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "convert_element_type_1336", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_550", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "convert_element_type_1337", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_551", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "convert_element_type_1338", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1336", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "alias_default_1106", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1106", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1338", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "mul_478", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1337", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_553", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "mul_479", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_478", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "alias_default_1107", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_479", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": 
"alias_default_1108", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1108", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1107", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "mul_480", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_480", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "sum_75", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1108", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "div_57", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_57", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_75", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "mul_481", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1107", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_481", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "sub_38", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_38", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_553", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "mul_482", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1106", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1108", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "mul_483", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_483", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "sum_76", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_482", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "convert_element_type_1339", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_76", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "convert_element_type_1340", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1100", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1339", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "add_248", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1340", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "dtype_cast_404", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_404", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.ffn_norm", + "name": "alias_default_1601", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_248", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wo", + "name": "alias_default_1109", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1109", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_548", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wo", + "name": "einsum_default_401", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_549", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wo", + "name": "permute_755", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1109", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_755", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wo", + "name": "einsum_default_402", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_401", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wo", + "name": "permute_756", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_756", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wo", + "name": "dtype_cast_405", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_405", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention.wo", + "name": "alias_default_1596", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_402", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_1100", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1100", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "permute_757", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_757", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_544", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_545", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_546", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_547", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_172", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_177", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_178", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_12", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.sdpa", + "name": "getitem_324", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.sdpa", + "name": "getitem_325", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.sdpa", + "name": "getitem_326", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_326", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "permute_758", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_325", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "permute_759", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_324", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "permute_760", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_758", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_1101", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1101", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.19.attention", + "name": "sum_77", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_77", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "squeeze_24", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_759", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_1102", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1102", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "sum_78", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_78", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "squeeze_25", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "convert_element_type_1345", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_760", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "convert_element_type_1346", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1345", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_1103", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1103", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_as_complex_88", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_543", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "_conj_24", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_24", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "clone_166", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_88", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_166", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": 
"mul_484", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1346", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_1104", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1104", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_as_complex_89", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_543", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "_conj_25", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_25", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "clone_167", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_89", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_167", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "mul_485", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_484", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_as_real_88", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_88", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_1105", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1105", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "convert_element_type_1347", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_485", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_as_real_89", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_89", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_1106", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1106", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "convert_element_type_1348", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_1107", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1347", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_1108", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1348", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "view_1109", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1107", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "alias_default_1110", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1110", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_539", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wv", + "name": "einsum_default_403", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_542", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wv", + "name": "permute_763", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1110", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_763", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention.wv", + "name": "einsum_default_404", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_403", + "src_placement": "P(sum)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wv", + "name": "permute_764", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_764", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wv", + "name": "dtype_cast_406", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_406", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention.wv", + "name": "alias_default_1595", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1108", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "alias_default_1111", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1111", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_539", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wk", + "name": "einsum_default_405", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_541", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wk", + "name": "permute_767", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1111", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_767", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wk", + "name": "einsum_default_406", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_404", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_406", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "add_249", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_405", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wk", + "name": "permute_768", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_768", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wk", + "name": "dtype_cast_407", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_407", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention.wk", + "name": "alias_default_1594", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1109", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention", + "name": "alias_default_1112", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1112", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_539", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wq", + "name": "einsum_default_407", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_540", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wq", + "name": "permute_771", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1112", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_771", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention.wq", + "name": "einsum_default_408", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_249", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_408", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19", + "name": "add_250", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_407", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wq", + "name": "permute_772", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_772", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention.wq", + "name": "dtype_cast_408", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_408", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention.wq", + "name": "alias_default_1593", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_250", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "convert_element_type_1361", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_535", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "convert_element_type_1362", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_536", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "convert_element_type_1363", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1361", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "alias_default_1113", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1113", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1363", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "mul_486", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1362", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_538", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "mul_487", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_486", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "alias_default_1114", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_487", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "alias_default_1115", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1115", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1114", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "mul_488", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_488", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "sum_79", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1115", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "div_58", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_58", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_79", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "mul_489", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1114", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_489", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "sub_39", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_39", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_538", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "mul_490", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1113", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1115", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "mul_491", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_491", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "sum_80", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_490", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "convert_element_type_1364", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_80", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "convert_element_type_1365", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1109", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1364", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "add_251", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1365", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "dtype_cast_409", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_409", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.19.attention_norm", + "name": "alias_default_1600", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_251", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w2", + "name": "alias_default_1116", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1116", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_533", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w2", + "name": "einsum_default_409", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_534", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.18.feed_forward.w2", + "name": "permute_775", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1116", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_775", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w2", + "name": "einsum_default_410", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_409", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w2", + "name": "permute_776", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_776", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w2", + "name": "dtype_cast_410", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_410", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w2", + "name": "alias_default_1589", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_410", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w2", + "name": "alias_default_1117", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1117", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_530", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "mul_492", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1117", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_532", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "mul_493", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_492", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "alias_default_1118", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1118", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_526", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w3", + "name": "einsum_default_411", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_531", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w3", + "name": "permute_779", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1118", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_779", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w3", + "name": "einsum_default_412", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_411", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w3", + "name": "permute_780", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_780", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.18.feed_forward.w3", + "name": "dtype_cast_411", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_411", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w3", + "name": "alias_default_1590", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_493", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "convert_element_type_1374", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_528", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "convert_element_type_1375", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1375", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "alias_default_1119", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1119", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "neg_45", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_45", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "exp_45", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_45", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "add_252", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_252", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "reciprocal_13", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_13", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "mul_494", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_494", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "alias_default_1120", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1374", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1120", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "mul_495", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1120", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "sub_40", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1119", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_40", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "mul_496", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_496", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "add_253", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_495", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_253", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "mul_497", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_497", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "convert_element_type_1376", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1376", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward", + "name": "alias_default_1121", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1121", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_526", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w1", + "name": "einsum_default_413", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_527", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w1", + "name": "permute_783", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1121", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_783", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w1", + "name": "einsum_default_414", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_412", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_414", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18", + "name": "add_254", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_413", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w1", + "name": "permute_784", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_784", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.feed_forward.w1", + "name": "dtype_cast_412", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_412", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.18.feed_forward.w1", + "name": "alias_default_1588", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_254", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "convert_element_type_1381", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_522", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "convert_element_type_1382", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_523", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "convert_element_type_1383", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1381", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "alias_default_1122", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1122", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1383", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "mul_498", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1382", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_525", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "mul_499", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_498", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "alias_default_1123", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_499", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": 
"alias_default_1124", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1124", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1123", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "mul_500", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_500", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "sum_81", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1124", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "div_59", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_59", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_81", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "mul_501", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1123", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_501", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "sub_41", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_41", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_525", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "mul_502", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1122", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1124", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "mul_503", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_503", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "sum_82", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_502", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "convert_element_type_1384", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_82", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "convert_element_type_1385", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1116", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1384", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "add_255", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1385", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "dtype_cast_413", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_413", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.ffn_norm", + "name": "alias_default_1592", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_255", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wo", + "name": "alias_default_1125", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1125", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_520", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wo", + "name": "einsum_default_415", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_521", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wo", + "name": "permute_787", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1125", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_787", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wo", + "name": "einsum_default_416", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_415", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wo", + "name": "permute_788", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_788", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wo", + "name": "dtype_cast_414", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_414", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention.wo", + "name": "alias_default_1587", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_416", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_1124", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1124", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "permute_789", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_789", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_516", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_517", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_518", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_519", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_163", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_168", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_169", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_13", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_13", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.sdpa", + "name": "getitem_327", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_13", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.sdpa", + "name": "getitem_328", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_13", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.sdpa", + "name": "getitem_329", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_329", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "permute_790", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_328", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "permute_791", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_327", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "permute_792", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_790", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_1125", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1125", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.18.attention", + "name": "sum_83", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_83", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "squeeze_26", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_791", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_1126", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1126", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "sum_84", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_84", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "squeeze_27", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_27", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "convert_element_type_1390", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_792", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "convert_element_type_1391", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1390", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_1127", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1127", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_as_complex_90", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_515", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "_conj_26", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_26", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "clone_174", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_90", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_174", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": 
"mul_504", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1391", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_1128", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1128", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_as_complex_91", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_515", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "_conj_27", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_27", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "clone_175", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_91", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_175", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "mul_505", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_504", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_as_real_90", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_90", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_1129", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1129", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "convert_element_type_1392", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_505", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_as_real_91", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_91", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_1130", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1130", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "convert_element_type_1393", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_1131", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1392", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_1132", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1393", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "view_1133", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1131", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "alias_default_1126", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1126", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_511", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wv", + "name": "einsum_default_417", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_514", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wv", + "name": "permute_795", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1126", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_795", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention.wv", + "name": "einsum_default_418", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_417", + "src_placement": "P(sum)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wv", + "name": "permute_796", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_796", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wv", + "name": "dtype_cast_415", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_415", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention.wv", + "name": "alias_default_1586", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1132", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "alias_default_1127", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1127", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_511", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wk", + "name": "einsum_default_419", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_513", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wk", + "name": "permute_799", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1127", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_799", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wk", + "name": "einsum_default_420", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_418", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_420", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "add_256", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_419", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wk", + "name": "permute_800", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_800", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wk", + "name": "dtype_cast_416", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_416", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention.wk", + "name": "alias_default_1585", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1133", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention", + "name": "alias_default_1128", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1128", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_511", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wq", + "name": "einsum_default_421", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_512", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wq", + "name": "permute_803", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1128", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_803", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention.wq", + "name": "einsum_default_422", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_256", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_422", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18", + "name": "add_257", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_421", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wq", + "name": "permute_804", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_804", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention.wq", + "name": "dtype_cast_417", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_417", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention.wq", + "name": "alias_default_1584", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_257", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "convert_element_type_1406", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_507", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "convert_element_type_1407", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_508", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "convert_element_type_1408", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1406", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "alias_default_1129", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1129", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1408", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "mul_506", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1407", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_510", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "mul_507", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_506", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "alias_default_1130", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_507", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "alias_default_1131", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1131", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1130", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "mul_508", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_508", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "sum_85", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1131", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "div_60", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_60", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_85", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "mul_509", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1130", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_509", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "sub_42", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_42", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_510", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "mul_510", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1129", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1131", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "mul_511", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_511", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "sum_86", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_510", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "convert_element_type_1409", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_86", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "convert_element_type_1410", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1125", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1409", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "add_258", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1410", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "dtype_cast_418", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_418", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.18.attention_norm", + "name": "alias_default_1591", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_258", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w2", + "name": "alias_default_1132", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1132", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_505", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w2", + "name": "einsum_default_423", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_506", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.17.feed_forward.w2", + "name": "permute_807", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1132", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_807", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w2", + "name": "einsum_default_424", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_423", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w2", + "name": "permute_808", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_808", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w2", + "name": "dtype_cast_419", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_419", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w2", + "name": "alias_default_1580", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_424", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w2", + "name": "alias_default_1133", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1133", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_502", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "mul_512", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1133", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_504", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "mul_513", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_512", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "alias_default_1134", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1134", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_498", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w3", + "name": "einsum_default_425", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_503", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w3", + "name": "permute_811", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1134", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_811", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w3", + "name": "einsum_default_426", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_425", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w3", + "name": "permute_812", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_812", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.17.feed_forward.w3", + "name": "dtype_cast_420", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_420", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w3", + "name": "alias_default_1581", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_513", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "convert_element_type_1419", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_500", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "convert_element_type_1420", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1420", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "alias_default_1135", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1135", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "neg_46", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_46", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "exp_46", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_46", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "add_259", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_259", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "reciprocal_14", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_14", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "mul_514", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_514", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "alias_default_1136", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1419", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1136", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "mul_515", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1136", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "sub_43", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1135", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "mul_516", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_516", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "add_260", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_515", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_260", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "mul_517", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_517", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "convert_element_type_1421", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1421", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward", + "name": "alias_default_1137", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1137", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_498", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w1", + "name": "einsum_default_427", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_499", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w1", + "name": "permute_815", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1137", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_815", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w1", + "name": "einsum_default_428", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_426", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_428", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17", + "name": "add_261", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_427", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w1", + "name": "permute_816", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_816", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.feed_forward.w1", + "name": "dtype_cast_421", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_421", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.17.feed_forward.w1", + "name": "alias_default_1579", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_261", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "convert_element_type_1426", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_494", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "convert_element_type_1427", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_495", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "convert_element_type_1428", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1426", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "alias_default_1138", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1138", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1428", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "mul_518", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1427", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_497", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "mul_519", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_518", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "alias_default_1139", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_519", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": 
"alias_default_1140", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1140", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1139", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "mul_520", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_520", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "sum_87", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1140", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "div_61", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_61", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_87", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "mul_521", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1139", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_521", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "sub_44", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_44", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_497", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "mul_522", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1138", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1140", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "mul_523", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_523", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "sum_88", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_522", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "convert_element_type_1429", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_88", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "convert_element_type_1430", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1132", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1429", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "add_262", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1430", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "dtype_cast_422", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_422", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.ffn_norm", + "name": "alias_default_1583", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_262", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wo", + "name": "alias_default_1141", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1141", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_492", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wo", + "name": "einsum_default_429", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_493", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wo", + "name": "permute_819", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1141", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_819", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wo", + "name": "einsum_default_430", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_429", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wo", + "name": "permute_820", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_820", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wo", + "name": "dtype_cast_423", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_423", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention.wo", + "name": "alias_default_1578", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_430", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_1148", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1148", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "permute_821", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_821", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_488", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_489", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_490", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_491", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_154", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_159", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_160", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_14", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.sdpa", + "name": "getitem_330", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.sdpa", + "name": "getitem_331", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.sdpa", + "name": "getitem_332", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_332", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "permute_822", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_331", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "permute_823", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_330", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "permute_824", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_822", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_1149", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1149", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.17.attention", + "name": "sum_89", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_89", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "squeeze_28", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_823", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_1150", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1150", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "sum_90", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_90", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "squeeze_29", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "convert_element_type_1435", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_824", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "convert_element_type_1436", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1435", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_1151", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1151", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_as_complex_92", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_487", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "_conj_28", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_28", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "clone_182", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_92", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_182", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": 
"mul_524", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1436", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_1152", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1152", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_as_complex_93", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_487", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "_conj_29", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_29", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "clone_183", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_93", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_183", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "mul_525", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_524", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_as_real_92", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_92", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_1153", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1153", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "convert_element_type_1437", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_525", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_as_real_93", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_93", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_1154", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1154", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "convert_element_type_1438", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_1155", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1437", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_1156", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1438", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "view_1157", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1155", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "alias_default_1142", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1142", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_483", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wv", + "name": "einsum_default_431", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_486", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wv", + "name": "permute_827", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1142", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_827", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention.wv", + "name": "einsum_default_432", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_431", + "src_placement": "P(sum)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wv", + "name": "permute_828", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_828", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wv", + "name": "dtype_cast_424", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_424", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention.wv", + "name": "alias_default_1577", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1156", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "alias_default_1143", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1143", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_483", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wk", + "name": "einsum_default_433", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_485", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wk", + "name": "permute_831", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1143", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_831", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wk", + "name": "einsum_default_434", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_432", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_434", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "add_263", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_433", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wk", + "name": "permute_832", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_832", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wk", + "name": "dtype_cast_425", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_425", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention.wk", + "name": "alias_default_1576", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1157", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention", + "name": "alias_default_1144", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1144", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_483", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wq", + "name": "einsum_default_435", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_484", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wq", + "name": "permute_835", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1144", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_835", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention.wq", + "name": "einsum_default_436", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_263", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_436", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17", + "name": "add_264", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_435", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wq", + "name": "permute_836", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_836", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention.wq", + "name": "dtype_cast_426", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_426", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention.wq", + "name": "alias_default_1575", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_264", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "convert_element_type_1451", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_479", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "convert_element_type_1452", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_480", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "convert_element_type_1453", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1451", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "alias_default_1145", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1145", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1453", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "mul_526", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1452", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_482", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "mul_527", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_526", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "alias_default_1146", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_527", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "alias_default_1147", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1147", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1146", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "mul_528", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_528", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "sum_91", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1147", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "div_62", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_62", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_91", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "mul_529", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1146", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_529", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "sub_45", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_45", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_482", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "mul_530", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1145", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1147", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "mul_531", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_531", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "sum_92", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_530", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "convert_element_type_1454", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_92", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "convert_element_type_1455", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1141", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1454", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "add_265", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1455", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "dtype_cast_427", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_427", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.17.attention_norm", + "name": "alias_default_1582", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_265", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w2", + "name": "alias_default_1148", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1148", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_477", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w2", + "name": "einsum_default_437", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_478", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.16.feed_forward.w2", + "name": "permute_839", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1148", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_839", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w2", + "name": "einsum_default_438", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_437", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w2", + "name": "permute_840", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_840", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w2", + "name": "dtype_cast_428", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_428", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w2", + "name": "alias_default_1571", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_438", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w2", + "name": "alias_default_1149", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1149", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_474", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "mul_532", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1149", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_476", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "mul_533", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_532", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "alias_default_1150", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1150", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_470", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w3", + "name": "einsum_default_439", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_475", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w3", + "name": "permute_843", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1150", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_843", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w3", + "name": "einsum_default_440", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_439", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w3", + "name": "permute_844", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_844", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.16.feed_forward.w3", + "name": "dtype_cast_429", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_429", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w3", + "name": "alias_default_1572", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_533", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "convert_element_type_1464", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_472", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "convert_element_type_1465", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1465", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "alias_default_1151", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1151", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "neg_47", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_47", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "exp_47", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_47", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "add_266", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_266", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "reciprocal_15", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_15", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "mul_534", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_534", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "alias_default_1152", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1464", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1152", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "mul_535", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1152", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "sub_46", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1151", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_46", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "mul_536", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_536", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "add_267", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_535", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_267", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "mul_537", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_537", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "convert_element_type_1466", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1466", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward", + "name": "alias_default_1153", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1153", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_470", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w1", + "name": "einsum_default_441", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_471", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w1", + "name": "permute_847", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1153", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_847", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w1", + "name": "einsum_default_442", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_440", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_442", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16", + "name": "add_268", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_441", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w1", + "name": "permute_848", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_848", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.feed_forward.w1", + "name": "dtype_cast_430", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_430", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.16.feed_forward.w1", + "name": "alias_default_1570", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_268", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "convert_element_type_1471", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_466", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "convert_element_type_1472", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_467", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "convert_element_type_1473", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1471", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "alias_default_1154", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1154", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1473", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "mul_538", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1472", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_469", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "mul_539", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_538", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "alias_default_1155", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_539", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": 
"alias_default_1156", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1156", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1155", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "mul_540", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_540", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "sum_93", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1156", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "div_63", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_63", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_93", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "mul_541", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1155", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_541", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "sub_47", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_47", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_469", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "mul_542", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1154", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1156", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "mul_543", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_543", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "sum_94", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_542", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "convert_element_type_1474", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_94", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "convert_element_type_1475", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1148", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1474", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "add_269", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1475", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "dtype_cast_431", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_431", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.ffn_norm", + "name": "alias_default_1574", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_269", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wo", + "name": "alias_default_1157", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1157", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_464", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wo", + "name": "einsum_default_443", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_465", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wo", + "name": "permute_851", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1157", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_851", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wo", + "name": "einsum_default_444", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_443", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wo", + "name": "permute_852", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_852", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wo", + "name": "dtype_cast_432", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_432", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention.wo", + "name": "alias_default_1569", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_444", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_1172", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1172", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "permute_853", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_853", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_460", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_461", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_462", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_463", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_145", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_150", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_151", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_15", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.sdpa", + "name": "getitem_333", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.sdpa", + "name": "getitem_334", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_15", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.sdpa", + "name": "getitem_335", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_335", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "permute_854", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_334", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "permute_855", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_333", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "permute_856", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_854", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_1173", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1173", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.16.attention", + "name": "sum_95", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_95", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "squeeze_30", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_855", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_1174", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1174", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "sum_96", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_96", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "squeeze_31", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "convert_element_type_1480", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_856", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "convert_element_type_1481", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1480", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_1175", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1175", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_as_complex_94", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_459", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "_conj_30", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_30", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "clone_190", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_94", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_190", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": 
"mul_544", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1481", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_1176", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1176", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_as_complex_95", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_459", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "_conj_31", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_31", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "clone_191", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_95", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_191", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "mul_545", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_544", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_as_real_94", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_94", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_1177", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1177", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "convert_element_type_1482", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_545", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_as_real_95", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_95", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_1178", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1178", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "convert_element_type_1483", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_30", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_1179", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1482", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_1180", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1483", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "view_1181", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1179", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "alias_default_1158", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1158", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_455", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wv", + "name": "einsum_default_445", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_458", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wv", + "name": "permute_859", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1158", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_859", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention.wv", + "name": "einsum_default_446", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_445", + "src_placement": "P(sum)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wv", + "name": "permute_860", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_860", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wv", + "name": "dtype_cast_433", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_433", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention.wv", + "name": "alias_default_1568", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1180", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "alias_default_1159", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1159", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_455", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wk", + "name": "einsum_default_447", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_457", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wk", + "name": "permute_863", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1159", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_863", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wk", + "name": "einsum_default_448", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_446", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_448", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "add_270", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_447", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wk", + "name": "permute_864", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_864", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wk", + "name": "dtype_cast_434", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_434", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention.wk", + "name": "alias_default_1567", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1181", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention", + "name": "alias_default_1160", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1160", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_455", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wq", + "name": "einsum_default_449", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_456", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wq", + "name": "permute_867", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1160", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_867", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention.wq", + "name": "einsum_default_450", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_270", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_450", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16", + "name": "add_271", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_449", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wq", + "name": "permute_868", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_868", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention.wq", + "name": "dtype_cast_435", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_435", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention.wq", + "name": "alias_default_1566", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_271", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "convert_element_type_1496", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_451", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "convert_element_type_1497", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_452", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "convert_element_type_1498", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1496", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "alias_default_1161", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1161", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1498", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "mul_546", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1497", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_454", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "mul_547", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_546", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "alias_default_1162", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_547", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "alias_default_1163", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1163", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1162", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "mul_548", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_548", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "sum_97", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1163", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "div_64", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_64", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_97", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "mul_549", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1162", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_549", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "sub_48", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_48", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_454", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "mul_550", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1161", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1163", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "mul_551", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_551", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "sum_98", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_550", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "convert_element_type_1499", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_98", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "convert_element_type_1500", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1157", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1499", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "add_272", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1500", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "dtype_cast_436", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_436", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.16.attention_norm", + "name": "alias_default_1573", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_272", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w2", + "name": "alias_default_1164", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1164", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_449", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w2", + "name": "einsum_default_451", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_450", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.15.feed_forward.w2", + "name": "permute_871", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1164", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_871", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w2", + "name": "einsum_default_452", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_451", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w2", + "name": "permute_872", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_872", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w2", + "name": "dtype_cast_437", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_437", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w2", + "name": "alias_default_1562", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_452", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w2", + "name": "alias_default_1165", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1165", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_446", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "mul_552", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1165", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_448", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "mul_553", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_552", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "alias_default_1166", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1166", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_442", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w3", + "name": "einsum_default_453", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_447", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w3", + "name": "permute_875", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1166", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_875", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w3", + "name": "einsum_default_454", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_453", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w3", + "name": "permute_876", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_876", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.15.feed_forward.w3", + "name": "dtype_cast_438", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_438", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w3", + "name": "alias_default_1563", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_553", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "convert_element_type_1509", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_444", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "convert_element_type_1510", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1510", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "alias_default_1167", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1167", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "neg_48", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_48", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "exp_48", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_48", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "add_273", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_273", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "reciprocal_16", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_16", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "mul_554", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_554", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "alias_default_1168", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1509", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1168", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "mul_555", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1168", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "sub_49", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1167", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_49", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "mul_556", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_556", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "add_274", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_555", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_274", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "mul_557", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_557", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "convert_element_type_1511", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1511", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward", + "name": "alias_default_1169", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1169", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_442", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w1", + "name": "einsum_default_455", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_443", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w1", + "name": "permute_879", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1169", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_879", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w1", + "name": "einsum_default_456", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_454", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_456", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15", + "name": "add_275", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_455", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w1", + "name": "permute_880", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_880", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.feed_forward.w1", + "name": "dtype_cast_439", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_439", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.15.feed_forward.w1", + "name": "alias_default_1561", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_275", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "convert_element_type_1516", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_438", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "convert_element_type_1517", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_439", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "convert_element_type_1518", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1516", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "alias_default_1170", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1170", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1518", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "mul_558", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1517", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_441", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "mul_559", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_558", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "alias_default_1171", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_559", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": 
"alias_default_1172", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1172", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1171", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "mul_560", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_560", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "sum_99", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1172", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "div_65", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_65", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_99", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "mul_561", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1171", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_561", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "sub_50", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_50", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_441", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "mul_562", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1170", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1172", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "mul_563", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_563", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "sum_100", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_562", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "convert_element_type_1519", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_100", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "convert_element_type_1520", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1164", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1519", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "add_276", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1520", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "dtype_cast_440", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_440", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.ffn_norm", + "name": "alias_default_1565", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_276", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wo", + "name": "alias_default_1173", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1173", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_436", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wo", + "name": "einsum_default_457", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_437", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wo", + "name": "permute_883", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1173", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_883", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wo", + "name": "einsum_default_458", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_457", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wo", + "name": "permute_884", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_884", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wo", + "name": "dtype_cast_441", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_441", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention.wo", + "name": "alias_default_1560", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_458", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_1196", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1196", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "permute_885", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_885", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_432", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_433", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_434", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_435", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_136", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_141", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_142", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_16", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.sdpa", + "name": "getitem_336", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.sdpa", + "name": "getitem_337", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.sdpa", + "name": "getitem_338", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_338", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "permute_886", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_337", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "permute_887", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_336", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "permute_888", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_886", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_1197", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1197", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.15.attention", + "name": "sum_101", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_101", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "squeeze_32", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_887", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_1198", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1198", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "sum_102", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_102", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "squeeze_33", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_33", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "convert_element_type_1525", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_888", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "convert_element_type_1526", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1525", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_1199", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1199", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_as_complex_96", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_431", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "_conj_32", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_32", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "clone_198", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_96", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_198", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": 
"mul_564", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1526", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_1200", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1200", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_as_complex_97", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_431", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "_conj_33", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_33", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "clone_199", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_97", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_199", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "mul_565", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_564", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_as_real_96", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_96", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_1201", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1201", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "convert_element_type_1527", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_565", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_as_real_97", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_97", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_1202", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1202", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "convert_element_type_1528", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_32", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_1203", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1527", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_1204", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1528", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "view_1205", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1203", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "alias_default_1174", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1174", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_427", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wv", + "name": "einsum_default_459", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_430", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wv", + "name": "permute_891", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1174", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_891", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention.wv", + "name": "einsum_default_460", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_459", + "src_placement": "P(sum)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wv", + "name": "permute_892", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_892", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wv", + "name": "dtype_cast_442", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_442", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention.wv", + "name": "alias_default_1559", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1204", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "alias_default_1175", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1175", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_427", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wk", + "name": "einsum_default_461", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_429", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wk", + "name": "permute_895", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1175", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_895", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wk", + "name": "einsum_default_462", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_460", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_462", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "add_277", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_461", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wk", + "name": "permute_896", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_896", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wk", + "name": "dtype_cast_443", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_443", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention.wk", + "name": "alias_default_1558", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1205", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention", + "name": "alias_default_1176", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1176", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_427", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wq", + "name": "einsum_default_463", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_428", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wq", + "name": "permute_899", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1176", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_899", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention.wq", + "name": "einsum_default_464", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_277", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_464", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15", + "name": "add_278", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_463", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wq", + "name": "permute_900", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_900", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention.wq", + "name": "dtype_cast_444", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_444", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention.wq", + "name": "alias_default_1557", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_278", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "convert_element_type_1541", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_423", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "convert_element_type_1542", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_424", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "convert_element_type_1543", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1541", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "alias_default_1177", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1177", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1543", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "mul_566", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1542", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_426", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "mul_567", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_566", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "alias_default_1178", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_567", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "alias_default_1179", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1179", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1178", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "mul_568", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_568", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "sum_103", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1179", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "div_66", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_66", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_103", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "mul_569", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1178", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_569", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "sub_51", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_51", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_426", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "mul_570", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1177", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1179", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "mul_571", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_571", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "sum_104", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_570", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "convert_element_type_1544", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_104", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "convert_element_type_1545", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1173", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1544", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "add_279", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1545", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "dtype_cast_445", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_445", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.15.attention_norm", + "name": "alias_default_1564", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_279", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w2", + "name": "alias_default_1180", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1180", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_421", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w2", + "name": "einsum_default_465", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_422", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.14.feed_forward.w2", + "name": "permute_903", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1180", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_903", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w2", + "name": "einsum_default_466", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_465", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w2", + "name": "permute_904", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_904", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w2", + "name": "dtype_cast_446", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_446", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w2", + "name": "alias_default_1553", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_466", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w2", + "name": "alias_default_1181", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1181", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_418", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "mul_572", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1181", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_420", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "mul_573", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_572", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "alias_default_1182", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1182", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_414", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w3", + "name": "einsum_default_467", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_419", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w3", + "name": "permute_907", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1182", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_907", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w3", + "name": "einsum_default_468", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_467", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w3", + "name": "permute_908", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_908", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.14.feed_forward.w3", + "name": "dtype_cast_447", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_447", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w3", + "name": "alias_default_1554", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_573", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "convert_element_type_1554", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_416", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "convert_element_type_1555", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1555", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "alias_default_1183", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1183", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "neg_49", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_49", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "exp_49", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_49", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "add_280", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_280", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "reciprocal_17", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_17", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "mul_574", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_574", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "alias_default_1184", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1554", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1184", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "mul_575", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1184", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "sub_52", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1183", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "mul_576", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_576", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "add_281", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_575", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_281", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "mul_577", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_577", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "convert_element_type_1556", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1556", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward", + "name": "alias_default_1185", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1185", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_414", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w1", + "name": "einsum_default_469", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_415", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w1", + "name": "permute_911", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1185", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_911", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w1", + "name": "einsum_default_470", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_468", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_470", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14", + "name": "add_282", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_469", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w1", + "name": "permute_912", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_912", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.feed_forward.w1", + "name": "dtype_cast_448", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_448", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.14.feed_forward.w1", + "name": "alias_default_1552", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_282", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "convert_element_type_1561", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_410", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "convert_element_type_1562", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_411", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "convert_element_type_1563", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1561", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "alias_default_1186", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1186", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1563", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "mul_578", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1562", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_413", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "mul_579", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_578", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "alias_default_1187", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_579", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": 
"alias_default_1188", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1188", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1187", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "mul_580", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_580", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "sum_105", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1188", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "div_67", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_67", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_105", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "mul_581", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1187", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_581", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "sub_53", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_53", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_413", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "mul_582", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1186", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1188", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "mul_583", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_583", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "sum_106", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_582", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "convert_element_type_1564", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_106", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "convert_element_type_1565", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1180", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1564", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "add_283", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1565", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "dtype_cast_449", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_449", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.ffn_norm", + "name": "alias_default_1556", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_283", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wo", + "name": "alias_default_1189", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1189", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_408", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wo", + "name": "einsum_default_471", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_409", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wo", + "name": "permute_915", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1189", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_915", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wo", + "name": "einsum_default_472", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_471", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wo", + "name": "permute_916", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_916", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wo", + "name": "dtype_cast_450", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_450", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention.wo", + "name": "alias_default_1551", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_472", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_1220", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1220", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "permute_917", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_917", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_404", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_405", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_406", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_407", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_127", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_132", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_133", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_17", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_17", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.sdpa", + "name": "getitem_339", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_17", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.sdpa", + "name": "getitem_340", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_17", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.sdpa", + "name": "getitem_341", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_341", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "permute_918", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_340", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "permute_919", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_339", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "permute_920", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_918", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_1221", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1221", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.14.attention", + "name": "sum_107", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_107", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "squeeze_34", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_919", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_1222", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1222", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "sum_108", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_108", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "squeeze_35", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_35", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "convert_element_type_1570", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_920", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "convert_element_type_1571", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1570", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_1223", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1223", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_as_complex_98", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_403", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "_conj_34", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_34", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "clone_206", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_98", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_206", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": 
"mul_584", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1571", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_1224", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1224", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_as_complex_99", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_403", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "_conj_35", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_35", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "clone_207", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_99", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_207", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "mul_585", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_584", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_as_real_98", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_98", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_1225", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1225", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "convert_element_type_1572", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", 
+ "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_585", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_as_real_99", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_99", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_1226", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1226", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "convert_element_type_1573", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_34", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_1227", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1572", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_1228", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 
0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1573", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "view_1229", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1227", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "alias_default_1190", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1190", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_399", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wv", + "name": "einsum_default_473", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_402", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wv", + "name": "permute_923", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1190", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_923", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention.wv", + "name": "einsum_default_474", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_473", + "src_placement": "P(sum)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wv", + "name": "permute_924", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_924", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wv", + "name": "dtype_cast_451", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_451", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention.wv", + "name": "alias_default_1550", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1228", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "alias_default_1191", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1191", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_399", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wk", + "name": "einsum_default_475", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_401", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wk", + "name": "permute_927", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1191", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_927", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wk", + "name": "einsum_default_476", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_474", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_476", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "add_284", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_475", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wk", + "name": "permute_928", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_928", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wk", + "name": "dtype_cast_452", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_452", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention.wk", + "name": "alias_default_1549", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1229", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention", + "name": "alias_default_1192", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1192", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_399", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wq", + "name": "einsum_default_477", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_400", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wq", + "name": "permute_931", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1192", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_931", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention.wq", + "name": "einsum_default_478", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_284", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_478", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14", + "name": "add_285", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_477", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wq", + "name": "permute_932", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_932", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention.wq", + "name": "dtype_cast_453", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_453", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention.wq", + "name": "alias_default_1548", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_285", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "convert_element_type_1586", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_395", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "convert_element_type_1587", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_396", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "convert_element_type_1588", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1586", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "alias_default_1193", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1193", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1588", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "mul_586", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1587", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_398", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "mul_587", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_586", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "alias_default_1194", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_587", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "alias_default_1195", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1195", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1194", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "mul_588", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_588", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "sum_109", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1195", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "div_68", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_68", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_109", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "mul_589", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1194", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_589", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "sub_54", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_54", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_398", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "mul_590", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1193", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1195", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "mul_591", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_591", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "sum_110", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_590", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "convert_element_type_1589", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_110", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "convert_element_type_1590", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1189", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1589", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "add_286", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1590", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "dtype_cast_454", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_454", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.14.attention_norm", + "name": "alias_default_1555", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_286", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w2", + "name": "alias_default_1196", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1196", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_393", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w2", + "name": "einsum_default_479", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_394", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.13.feed_forward.w2", + "name": "permute_935", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1196", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_935", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w2", + "name": "einsum_default_480", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_479", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w2", + "name": "permute_936", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_936", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w2", + "name": "dtype_cast_455", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_455", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w2", + "name": "alias_default_1544", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_480", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w2", + "name": "alias_default_1197", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1197", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_390", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "mul_592", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1197", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_392", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "mul_593", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_592", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "alias_default_1198", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1198", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_386", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w3", + "name": "einsum_default_481", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_391", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w3", + "name": "permute_939", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1198", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_939", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w3", + "name": "einsum_default_482", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_481", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w3", + "name": "permute_940", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_940", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.13.feed_forward.w3", + "name": "dtype_cast_456", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_456", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w3", + "name": "alias_default_1545", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_593", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "convert_element_type_1599", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_388", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "convert_element_type_1600", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1600", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "alias_default_1199", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1199", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "neg_50", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_50", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "exp_50", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_50", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "add_287", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_287", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "reciprocal_18", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_18", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "mul_594", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_594", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "alias_default_1200", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1599", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1200", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "mul_595", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1200", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "sub_55", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1199", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_55", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "mul_596", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_596", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "add_288", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_595", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_288", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "mul_597", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_597", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "convert_element_type_1601", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1601", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward", + "name": "alias_default_1201", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1201", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_386", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w1", + "name": "einsum_default_483", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_387", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w1", + "name": "permute_943", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1201", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_943", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w1", + "name": "einsum_default_484", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_482", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_484", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13", + "name": "add_289", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_483", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w1", + "name": "permute_944", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_944", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.feed_forward.w1", + "name": "dtype_cast_457", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_457", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.13.feed_forward.w1", + "name": "alias_default_1543", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_289", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "convert_element_type_1606", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_382", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "convert_element_type_1607", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_383", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "convert_element_type_1608", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1606", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "alias_default_1202", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1202", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1608", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "mul_598", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1607", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_385", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "mul_599", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_598", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "alias_default_1203", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_599", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": 
"alias_default_1204", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1204", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1203", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "mul_600", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_600", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "sum_111", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1204", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "div_69", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_69", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_111", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "mul_601", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1203", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_601", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "sub_56", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_56", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_385", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "mul_602", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1202", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1204", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "mul_603", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_603", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "sum_112", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_602", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "convert_element_type_1609", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_112", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "convert_element_type_1610", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1196", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1609", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "add_290", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1610", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "dtype_cast_458", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_458", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.ffn_norm", + "name": "alias_default_1547", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_290", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wo", + "name": "alias_default_1205", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1205", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_380", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wo", + "name": "einsum_default_485", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_381", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wo", + "name": "permute_947", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1205", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_947", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wo", + "name": "einsum_default_486", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_485", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wo", + "name": "permute_948", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_948", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wo", + "name": "dtype_cast_459", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_459", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention.wo", + "name": "alias_default_1542", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_486", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_1244", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1244", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "permute_949", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_949", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_376", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_377", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_378", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_379", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_118", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_123", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_124", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_18", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.sdpa", + "name": "getitem_342", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.sdpa", + "name": "getitem_343", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.sdpa", + "name": "getitem_344", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_344", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "permute_950", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_343", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "permute_951", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_342", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "permute_952", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_950", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_1245", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1245", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.13.attention", + "name": "sum_113", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_113", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "squeeze_36", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_951", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_1246", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1246", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "sum_114", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_114", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "squeeze_37", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_37", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "convert_element_type_1615", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_952", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "convert_element_type_1616", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1615", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_1247", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1247", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_as_complex_100", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_375", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "_conj_36", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_36", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "clone_214", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_100", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_214", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": 
"mul_604", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1616", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_1248", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1248", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_as_complex_101", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_375", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "_conj_37", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_37", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "clone_215", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_101", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_215", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "mul_605", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_604", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_as_real_100", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_100", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_1249", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1249", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "convert_element_type_1617", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_605", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_as_real_101", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_101", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_1250", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"view_1250", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "convert_element_type_1618", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_36", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_1251", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1617", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_1252", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1618", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "view_1253", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1251", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "alias_default_1206", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1206", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_371", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wv", + "name": "einsum_default_487", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = 
self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_374", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wv", + "name": "permute_955", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1206", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_955", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention.wv", + "name": "einsum_default_488", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_487", + "src_placement": 
"P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wv", + "name": "permute_956", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_956", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wv", + "name": "dtype_cast_460", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_460", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention.wv", + "name": "alias_default_1541", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1252", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "alias_default_1207", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1207", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_371", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wk", + "name": "einsum_default_489", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_373", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wk", + "name": "permute_959", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1207", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_959", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wk", + "name": "einsum_default_490", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_488", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_490", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "add_291", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_489", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wk", + "name": "permute_960", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_960", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wk", + "name": "dtype_cast_461", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_461", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention.wk", + "name": "alias_default_1540", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1253", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention", + "name": "alias_default_1208", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1208", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_371", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wq", + "name": "einsum_default_491", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_372", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wq", + "name": "permute_963", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1208", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_963", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention.wq", + "name": "einsum_default_492", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_291", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_492", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13", + "name": "add_292", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_491", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wq", + "name": "permute_964", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_964", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention.wq", + "name": "dtype_cast_462", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_462", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention.wq", + "name": "alias_default_1539", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_292", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "convert_element_type_1631", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_367", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "convert_element_type_1632", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_368", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "convert_element_type_1633", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1631", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "alias_default_1209", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1209", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1633", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "mul_606", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1632", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_370", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "mul_607", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_606", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "alias_default_1210", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_607", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "alias_default_1211", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1211", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1210", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "mul_608", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_608", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "sum_115", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1211", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "div_70", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_70", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_115", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "mul_609", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1210", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_609", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "sub_57", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_57", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_370", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "mul_610", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1209", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1211", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "mul_611", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_611", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "sum_116", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_610", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "convert_element_type_1634", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_116", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "convert_element_type_1635", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1205", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1634", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "add_293", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1635", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "dtype_cast_463", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_463", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.13.attention_norm", + "name": "alias_default_1546", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_293", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w2", + "name": "alias_default_1212", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1212", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_365", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w2", + "name": "einsum_default_493", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_366", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.12.feed_forward.w2", + "name": "permute_967", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1212", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_967", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w2", + "name": "einsum_default_494", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_493", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w2", + "name": "permute_968", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_968", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w2", + "name": "dtype_cast_464", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_464", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w2", + "name": "alias_default_1535", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_494", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w2", + "name": "alias_default_1213", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1213", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_362", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "mul_612", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1213", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_364", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "mul_613", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_612", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "alias_default_1214", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1214", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_358", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w3", + "name": "einsum_default_495", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_363", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w3", + "name": "permute_971", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1214", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_971", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w3", + "name": "einsum_default_496", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_495", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w3", + "name": "permute_972", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_972", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.12.feed_forward.w3", + "name": "dtype_cast_465", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_465", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w3", + "name": "alias_default_1536", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_613", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "convert_element_type_1644", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_360", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "convert_element_type_1645", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1645", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "alias_default_1215", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1215", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "neg_51", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_51", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "exp_51", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_51", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "add_294", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_294", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "reciprocal_19", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_19", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "mul_614", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_614", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "alias_default_1216", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1644", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1216", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "mul_615", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1216", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "sub_58", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1215", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "mul_616", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_616", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "add_295", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_615", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_295", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "mul_617", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_617", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "convert_element_type_1646", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1646", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward", + "name": "alias_default_1217", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1217", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_358", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w1", + "name": "einsum_default_497", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_359", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w1", + "name": "permute_975", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1217", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_975", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w1", + "name": "einsum_default_498", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_496", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_498", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12", + "name": "add_296", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_497", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w1", + "name": "permute_976", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_976", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.feed_forward.w1", + "name": "dtype_cast_466", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_466", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.12.feed_forward.w1", + "name": "alias_default_1534", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_296", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "convert_element_type_1651", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_354", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "convert_element_type_1652", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_355", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "convert_element_type_1653", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1651", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "alias_default_1218", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1218", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1653", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "mul_618", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1652", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_357", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "mul_619", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_618", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "alias_default_1219", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_619", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": 
"alias_default_1220", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1220", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1219", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "mul_620", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_620", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "sum_117", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1220", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "div_71", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_71", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_117", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "mul_621", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1219", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_621", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "sub_59", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_59", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_357", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "mul_622", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1218", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1220", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "mul_623", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_623", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "sum_118", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_622", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "convert_element_type_1654", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_118", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "convert_element_type_1655", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1212", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1654", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "add_297", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1655", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "dtype_cast_467", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_467", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.ffn_norm", + "name": "alias_default_1538", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_297", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wo", + "name": "alias_default_1221", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1221", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_352", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wo", + "name": "einsum_default_499", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_353", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wo", + "name": "permute_979", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1221", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_979", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wo", + "name": "einsum_default_500", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_499", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wo", + "name": "permute_980", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_980", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wo", + "name": "dtype_cast_468", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_468", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention.wo", + "name": "alias_default_1533", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_500", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_1268", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1268", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "permute_981", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_981", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_348", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_349", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_350", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_351", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_109", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_114", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_115", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_19", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.sdpa", + "name": "getitem_345", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.sdpa", + "name": "getitem_346", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.sdpa", + "name": "getitem_347", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_347", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "permute_982", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_346", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "permute_983", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_345", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "permute_984", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_982", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_1269", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1269", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.12.attention", + "name": "sum_119", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_119", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "squeeze_38", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_983", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_1270", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1270", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "sum_120", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_120", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "squeeze_39", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_39", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "convert_element_type_1660", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_984", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "convert_element_type_1661", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1660", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_1271", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1271", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_as_complex_102", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_347", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "_conj_38", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_38", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "clone_222", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_102", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_222", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": 
"mul_624", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1661", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_1272", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1272", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_as_complex_103", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_347", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "_conj_39", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_39", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "clone_223", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_103", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_223", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "mul_625", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_624", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_as_real_102", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_102", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_1273", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1273", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "convert_element_type_1662", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_625", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_as_real_103", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_103", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_1274", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"view_1274", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "convert_element_type_1663", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_38", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_1275", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1662", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_1276", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1663", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "view_1277", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1275", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "alias_default_1222", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1222", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_343", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wv", + "name": "einsum_default_501", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = 
self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_346", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wv", + "name": "permute_987", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1222", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_987", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention.wv", + "name": "einsum_default_502", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_501", + "src_placement": 
"P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wv", + "name": "permute_988", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_988", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wv", + "name": "dtype_cast_469", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_469", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention.wv", + "name": "alias_default_1532", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1276", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "alias_default_1223", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1223", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_343", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wk", + "name": "einsum_default_503", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_345", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wk", + "name": "permute_991", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1223", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_991", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wk", + "name": "einsum_default_504", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_502", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_504", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "add_298", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_503", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wk", + "name": "permute_992", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_992", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wk", + "name": "dtype_cast_470", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_470", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention.wk", + "name": "alias_default_1531", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1277", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention", + "name": "alias_default_1224", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1224", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_343", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wq", + "name": "einsum_default_505", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_344", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wq", + "name": "permute_995", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1224", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_995", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention.wq", + "name": "einsum_default_506", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_298", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_506", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12", + "name": "add_299", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_505", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wq", + "name": "permute_996", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_996", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention.wq", + "name": "dtype_cast_471", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_471", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention.wq", + "name": "alias_default_1530", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_299", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "convert_element_type_1676", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_339", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "convert_element_type_1677", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_340", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "convert_element_type_1678", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1676", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "alias_default_1225", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1225", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1678", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "mul_626", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1677", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_342", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "mul_627", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_626", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "alias_default_1226", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_627", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "alias_default_1227", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1227", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1226", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "mul_628", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_628", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "sum_121", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1227", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "div_72", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_72", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_121", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "mul_629", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1226", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_629", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "sub_60", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_60", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_342", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "mul_630", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1225", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1227", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "mul_631", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_631", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "sum_122", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_630", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "convert_element_type_1679", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_122", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "convert_element_type_1680", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1221", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1679", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "add_300", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1680", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "dtype_cast_472", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_472", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.12.attention_norm", + "name": "alias_default_1537", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_300", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w2", + "name": "alias_default_1228", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1228", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_337", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w2", + "name": "einsum_default_507", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_338", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.11.feed_forward.w2", + "name": "permute_999", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1228", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_999", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w2", + "name": "einsum_default_508", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_507", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w2", + "name": "permute_1000", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_1000", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w2", + "name": "dtype_cast_473", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_473", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w2", + "name": "alias_default_1526", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_508", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w2", + "name": "alias_default_1229", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1229", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_334", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "mul_632", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1229", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_336", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "mul_633", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_632", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "alias_default_1230", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1230", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_330", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w3", + "name": "einsum_default_509", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_335", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w3", + "name": "permute_1003", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1230", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1003", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w3", + "name": "einsum_default_510", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_509", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w3", + "name": "permute_1004", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1004", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + 
], + "module_path": "L['self'].layers.11.feed_forward.w3", + "name": "dtype_cast_474", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_474", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w3", + "name": "alias_default_1527", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_633", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "convert_element_type_1689", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + 
"compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_332", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "convert_element_type_1690", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1690", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "alias_default_1231", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1231", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "neg_52", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "exp_52", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "add_301", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_301", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "reciprocal_20", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_20", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "mul_634", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_634", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "alias_default_1232", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1689", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1232", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "mul_635", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1232", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "sub_61", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1231", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_61", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "mul_636", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_636", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "add_302", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_635", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_302", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "mul_637", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_637", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "convert_element_type_1691", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1691", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward", + "name": "alias_default_1233", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1233", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_330", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w1", + "name": "einsum_default_511", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_331", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w1", + "name": "permute_1007", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1233", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1007", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w1", + "name": "einsum_default_512", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_510", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_512", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11", + "name": "add_303", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_511", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w1", + "name": "permute_1008", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1008", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.feed_forward.w1", + "name": "dtype_cast_475", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_475", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], 
+ "module_path": "L['self'].layers.11.feed_forward.w1", + "name": "alias_default_1525", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_303", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "convert_element_type_1696", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_326", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "convert_element_type_1697", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_327", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "convert_element_type_1698", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1696", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "alias_default_1234", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1234", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1698", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "mul_638", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1697", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_329", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "mul_639", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_638", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "alias_default_1235", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_639", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": 
"alias_default_1236", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1236", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1235", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "mul_640", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_640", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "sum_123", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1236", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "div_73", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_73", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_123", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "mul_641", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1235", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_641", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "sub_62", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_62", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_329", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "mul_642", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1234", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1236", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "mul_643", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_643", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "sum_124", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_642", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "convert_element_type_1699", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_124", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "convert_element_type_1700", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1228", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1699", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "add_304", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1700", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "dtype_cast_476", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_476", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.ffn_norm", + "name": "alias_default_1529", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_304", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wo", + "name": "alias_default_1237", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1237", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_324", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wo", + "name": "einsum_default_513", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_325", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wo", + "name": "permute_1011", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1237", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1011", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wo", + "name": "einsum_default_514", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_513", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wo", + "name": "permute_1012", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1012", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wo", + "name": "dtype_cast_477", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_477", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention.wo", + "name": "alias_default_1524", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_514", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_1292", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1292", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "permute_1013", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1013", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_320", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_321", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_322", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_323", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_100", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_105", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, 
+ "dst_placement": "RR", + "name": "getitem_106", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_20", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.sdpa", + "name": "getitem_348", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.sdpa", + "name": "getitem_349", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_20", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.sdpa", + "name": "getitem_350", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_350", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "permute_1014", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_349", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "permute_1015", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_348", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "permute_1016", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1014", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_1293", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1293", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.11.attention", + "name": "sum_125", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_125", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "squeeze_40", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1015", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_1294", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1294", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "sum_126", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_126", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "squeeze_41", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_41", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "convert_element_type_1705", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1016", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "convert_element_type_1706", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1705", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_1295", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1295", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_as_complex_104", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_319", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "_conj_40", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_40", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "clone_230", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_104", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_230", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": 
"mul_644", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1706", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_1296", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1296", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_as_complex_105", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_319", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "_conj_41", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_41", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "clone_231", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_105", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_231", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "mul_645", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_644", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_as_real_104", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_104", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_1297", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1297", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "convert_element_type_1707", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_645", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_as_real_105", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_105", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_1298", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"view_1298", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "convert_element_type_1708", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_40", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_1299", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1707", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_1300", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1708", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "view_1301", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1299", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "alias_default_1238", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1238", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_315", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wv", + "name": "einsum_default_515", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = 
self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_318", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wv", + "name": "permute_1019", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1238", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1019", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention.wv", + "name": "einsum_default_516", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_515", + "src_placement": 
"P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wv", + "name": "permute_1020", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1020", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wv", + "name": "dtype_cast_478", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_478", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention.wv", + "name": "alias_default_1523", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, 
+ "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1300", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "alias_default_1239", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1239", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_315", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wk", + "name": "einsum_default_517", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_317", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wk", + "name": "permute_1023", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1239", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1023", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wk", + "name": "einsum_default_518", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_516", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_518", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "add_305", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_517", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wk", + "name": "permute_1024", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1024", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wk", + "name": "dtype_cast_479", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_479", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention.wk", + "name": "alias_default_1522", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + 
}, + { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1301", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention", + "name": "alias_default_1240", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1240", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_315", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wq", + "name": "einsum_default_519", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_316", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wq", + "name": "permute_1027", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 
4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1240", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1027", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention.wq", + "name": "einsum_default_520", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_305", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_520", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11", + "name": "add_306", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + 
"cluster_root": "permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_519", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wq", + "name": "permute_1028", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1028", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention.wq", + "name": "dtype_cast_480", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_480", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention.wq", + "name": "alias_default_1521", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_306", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "convert_element_type_1721", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_311", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "convert_element_type_1722", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_312", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "convert_element_type_1723", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1721", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "alias_default_1241", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1241", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1723", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "mul_646", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1722", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_314", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "mul_647", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_646", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "alias_default_1242", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_647", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "alias_default_1243", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1243", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1242", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "mul_648", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_648", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "sum_127", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1243", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "div_74", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_74", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_127", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "mul_649", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1242", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_649", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "sub_63", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_63", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_314", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "mul_650", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1241", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1243", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "mul_651", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_651", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "sum_128", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_650", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "convert_element_type_1724", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_128", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "convert_element_type_1725", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1237", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1724", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "add_307", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1725", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "dtype_cast_481", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_481", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.11.attention_norm", + "name": "alias_default_1528", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_307", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w2", + "name": "alias_default_1244", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1244", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_309", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w2", + "name": "einsum_default_521", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_310", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.10.feed_forward.w2", + "name": "permute_1031", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1244", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_1031", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w2", + "name": "einsum_default_522", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_521", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w2", + "name": "permute_1032", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, 
+ "cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_1032", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w2", + "name": "dtype_cast_482", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_482", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w2", + "name": "alias_default_1517", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_522", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w2", + "name": "alias_default_1245", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1245", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_306", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "mul_652", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1245", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_308", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "mul_653", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_652", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "alias_default_1246", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1246", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_302", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w3", + "name": "einsum_default_523", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_307", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w3", + "name": "permute_1035", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1246", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1035", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w3", + "name": "einsum_default_524", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_523", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w3", + "name": "permute_1036", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1036", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + 
], + "module_path": "L['self'].layers.10.feed_forward.w3", + "name": "dtype_cast_483", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_483", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w3", + "name": "alias_default_1518", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_653", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "convert_element_type_1734", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + 
"compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_304", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "convert_element_type_1735", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1735", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "alias_default_1247", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1247", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "neg_53", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "exp_53", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "add_308", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_308", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "reciprocal_21", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], 
+ "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_21", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "mul_654", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_654", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "alias_default_1248", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1734", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1248", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "mul_655", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1248", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "sub_64", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1247", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_64", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "mul_656", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_656", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "add_309", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_655", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_309", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "mul_657", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_657", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "convert_element_type_1736", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1736", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward", + "name": "alias_default_1249", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1249", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_302", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w1", + "name": "einsum_default_525", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": 
[ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_303", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w1", + "name": "permute_1039", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1249", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1039", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w1", + "name": "einsum_default_526", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_524", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_526", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10", + "name": "add_310", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_525", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w1", + "name": "permute_1040", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1040", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.feed_forward.w1", + "name": "dtype_cast_484", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_484", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], 
+ "module_path": "L['self'].layers.10.feed_forward.w1", + "name": "alias_default_1516", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_310", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "convert_element_type_1741", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_298", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "convert_element_type_1742", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_299", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "convert_element_type_1743", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1741", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "alias_default_1250", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1250", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1743", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "mul_658", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1742", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_301", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "mul_659", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_658", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "alias_default_1251", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_659", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": 
"alias_default_1252", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1252", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1251", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "mul_660", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_660", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "sum_129", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1252", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "div_75", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_75", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_129", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "mul_661", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1251", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_661", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "sub_65", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_65", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_301", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "mul_662", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1250", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1252", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "mul_663", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_663", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "sum_130", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_662", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "convert_element_type_1744", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_130", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "convert_element_type_1745", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + 
"cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1244", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1744", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "add_311", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1745", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "dtype_cast_485", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_485", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.ffn_norm", + "name": "alias_default_1520", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 
+ ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_311", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wo", + "name": "alias_default_1253", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1253", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_296", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wo", + "name": "einsum_default_527", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_297", + "src_placement": 
"RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wo", + "name": "permute_1043", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1253", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1043", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wo", + "name": "einsum_default_528", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_527", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wo", + "name": "permute_1044", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": 
"dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1044", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wo", + "name": "dtype_cast_486", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_486", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention.wo", + "name": "alias_default_1515", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_528", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_1316", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1316", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "permute_1045", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1045", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_292", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_293", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_294", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_295", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_91", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_96", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_97", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_21", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.sdpa", + "name": "getitem_351", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.sdpa", + "name": "getitem_352", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.sdpa", + "name": "getitem_353", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_353", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "permute_1046", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_352", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "permute_1047", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_351", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "permute_1048", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1046", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_1317", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1317", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.10.attention", + "name": "sum_131", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_131", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "squeeze_42", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1047", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_1318", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1318", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "sum_132", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_132", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "squeeze_43", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_43", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "convert_element_type_1750", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1048", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "convert_element_type_1751", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1750", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_1319", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1319", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_as_complex_106", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_291", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "_conj_42", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_42", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "clone_238", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_106", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_238", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": 
"mul_664", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1751", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_1320", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1320", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_as_complex_107", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_291", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "_conj_43", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_43", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "clone_239", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_107", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_239", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "mul_665", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": 
"apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_664", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_as_real_106", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_106", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_1321", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1321", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "convert_element_type_1752", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_665", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_as_real_107", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_107", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_1322", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": 
"view_1322", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "convert_element_type_1753", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_42", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_1323", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1752", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_1324", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1753", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "view_1325", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1323", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "alias_default_1254", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1254", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_287", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wv", + "name": "einsum_default_529", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = 
self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_290", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wv", + "name": "permute_1051", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1254", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1051", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention.wv", + "name": "einsum_default_530", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_529", + "src_placement": 
"P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wv", + "name": "permute_1052", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1052", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wv", + "name": "dtype_cast_487", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_487", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention.wv", + "name": "alias_default_1514", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, 
+ "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1324", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "alias_default_1255", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1255", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_287", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wk", + "name": "einsum_default_531", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_289", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wk", + "name": "permute_1055", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1255", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1055", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wk", + "name": "einsum_default_532", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_530", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_532", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "add_312", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_531", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wk", + "name": "permute_1056", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1056", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wk", + "name": "dtype_cast_488", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_488", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention.wk", + "name": "alias_default_1513", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + 
}, + { + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1325", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention", + "name": "alias_default_1256", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1256", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_287", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wq", + "name": "einsum_default_533", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_288", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wq", + "name": "permute_1059", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 
4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1256", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1059", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention.wq", + "name": "einsum_default_534", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_312", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_534", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10", + "name": "add_313", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + 
"cluster_root": "permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_533", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wq", + "name": "permute_1060", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1060", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention.wq", + "name": "dtype_cast_489", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_489", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention.wq", + "name": "alias_default_1512", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_313", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "convert_element_type_1766", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_283", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "convert_element_type_1767", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_284", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "convert_element_type_1768", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1766", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "alias_default_1257", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1257", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1768", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "mul_666", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { 
+ "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1767", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_286", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "mul_667", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_666", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "alias_default_1258", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_667", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "alias_default_1259", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1259", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1258", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "mul_668", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_668", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "sum_133", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1259", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "div_76", + 
"op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_76", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_133", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "mul_669", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1258", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_669", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "sub_66", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_66", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_286", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "mul_670", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1257", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1259", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "mul_671", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_671", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "sum_134", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_670", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "convert_element_type_1769", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_134", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "convert_element_type_1770", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1253", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1769", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "add_314", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1770", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "dtype_cast_490", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_490", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.10.attention_norm", + "name": "alias_default_1519", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_314", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w2", + "name": "alias_default_1260", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1260", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_281", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w2", + "name": "einsum_default_535", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_282", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.9.feed_forward.w2", + "name": "permute_1063", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1260", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_1063", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w2", + "name": "einsum_default_536", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_535", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w2", + "name": "permute_1064", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_1064", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w2", + "name": "dtype_cast_491", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_491", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w2", + "name": "alias_default_1508", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_536", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w2", + "name": "alias_default_1261", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1261", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_278", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "mul_672", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1261", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_280", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "mul_673", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_672", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "alias_default_1262", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1262", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_274", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w3", + "name": "einsum_default_537", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_279", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w3", + "name": "permute_1067", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1262", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1067", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w3", + "name": "einsum_default_538", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_537", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w3", + "name": "permute_1068", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1068", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.9.feed_forward.w3", + "name": "dtype_cast_492", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_492", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w3", + "name": "alias_default_1509", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_673", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "convert_element_type_1779", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_276", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "convert_element_type_1780", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1780", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "alias_default_1263", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1263", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "neg_54", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "exp_54", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "add_315", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_315", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "reciprocal_22", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_22", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "mul_674", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_674", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "alias_default_1264", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1779", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1264", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "mul_675", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1264", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "sub_67", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1263", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_67", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "mul_676", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_676", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "add_316", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_675", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_316", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "mul_677", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_677", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "convert_element_type_1781", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1781", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward", + "name": "alias_default_1265", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1265", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_274", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w1", + "name": "einsum_default_539", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_275", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w1", + "name": "permute_1071", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1265", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1071", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w1", + "name": "einsum_default_540", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_538", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_540", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9", + "name": "add_317", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_539", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w1", + "name": "permute_1072", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1072", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.feed_forward.w1", + "name": "dtype_cast_493", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_493", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.9.feed_forward.w1", + "name": "alias_default_1507", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_317", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "convert_element_type_1786", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_270", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "convert_element_type_1787", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_271", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "convert_element_type_1788", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1786", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "alias_default_1266", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1266", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1788", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "mul_678", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1787", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_273", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "mul_679", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_678", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "alias_default_1267", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_679", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": 
"alias_default_1268", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1268", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1267", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "mul_680", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_680", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "sum_135", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1268", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "div_77", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_77", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_135", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "mul_681", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1267", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_681", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "sub_68", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_68", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_273", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "mul_682", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1266", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1268", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "mul_683", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_683", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "sum_136", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_682", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "convert_element_type_1789", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_136", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "convert_element_type_1790", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": 
"add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1260", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1789", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "add_318", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1790", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "dtype_cast_494", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_494", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.ffn_norm", + "name": "alias_default_1511", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { 
+ "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_318", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wo", + "name": "alias_default_1269", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1269", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_268", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wo", + "name": "einsum_default_541", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_269", + "src_placement": "RR", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wo", + "name": "permute_1075", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1269", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1075", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wo", + "name": "einsum_default_542", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_541", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wo", + "name": "permute_1076", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + 
"compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1076", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wo", + "name": "dtype_cast_495", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_495", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention.wo", + "name": "alias_default_1506", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_542", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_1340", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1340", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "permute_1077", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1077", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_264", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_265", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_266", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_267", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_82", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_87", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_88", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_22", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_22", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.sdpa", + "name": "getitem_354", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_22", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.sdpa", + "name": "getitem_355", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_22", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.sdpa", + "name": "getitem_356", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_356", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "permute_1078", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_355", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "permute_1079", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_354", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "permute_1080", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1078", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_1341", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1341", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.9.attention", + "name": "sum_137", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_137", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "squeeze_44", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1079", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_1342", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1342", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "sum_138", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_138", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "squeeze_45", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_45", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "convert_element_type_1795", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1080", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "convert_element_type_1796", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1795", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_1343", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1343", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_as_complex_108", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_263", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "_conj_44", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_44", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "clone_246", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_108", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_246", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": 
"mul_684", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1796", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_1344", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1344", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_as_complex_109", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_263", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "_conj_45", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_45", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "clone_247", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_109", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_247", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "mul_685", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", 
+ "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_684", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_as_real_108", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_108", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_1345", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1345", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "convert_element_type_1797", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_685", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_as_real_109", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_109", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_1346", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1346", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "convert_element_type_1798", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_44", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_1347", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1797", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_1348", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1798", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "view_1349", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1347", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "alias_default_1270", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1270", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_259", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wv", + "name": "einsum_default_543", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_262", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wv", + "name": "permute_1083", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1270", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1083", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention.wv", + "name": "einsum_default_544", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_543", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.9.attention.wv", + "name": "permute_1084", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1084", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wv", + "name": "dtype_cast_496", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_496", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention.wv", + "name": "alias_default_1505", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1348", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "alias_default_1271", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1271", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_259", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wk", + "name": "einsum_default_545", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_261", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wk", + "name": "permute_1087", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1271", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1087", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wk", + "name": "einsum_default_546", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_544", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_546", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "add_319", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_545", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wk", + "name": "permute_1088", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1088", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wk", + "name": "dtype_cast_497", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_497", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention.wk", + "name": "alias_default_1504", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + 
{ + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1349", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention", + "name": "alias_default_1272", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1272", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_259", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wq", + "name": "einsum_default_547", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_260", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wq", + "name": "permute_1091", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1272", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1091", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention.wq", + "name": "einsum_default_548", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_319", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_548", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9", + "name": "add_320", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_547", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wq", + "name": "permute_1092", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1092", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention.wq", + "name": "dtype_cast_498", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_498", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention.wq", + "name": "alias_default_1503", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_320", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "convert_element_type_1811", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_255", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "convert_element_type_1812", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_256", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "convert_element_type_1813", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1811", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "alias_default_1273", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1273", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1813", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "mul_686", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1812", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_258", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "mul_687", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_686", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "alias_default_1274", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_687", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "alias_default_1275", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1275", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1274", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "mul_688", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_688", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "sum_139", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1275", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "div_78", + "op": 
"aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_78", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_139", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "mul_689", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1274", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_689", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "sub_69", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_69", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_258", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "mul_690", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1273", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1275", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "mul_691", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_691", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "sum_140", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_690", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "convert_element_type_1814", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_140", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "convert_element_type_1815", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1269", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1814", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "add_321", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1815", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "dtype_cast_499", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_499", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.9.attention_norm", + "name": "alias_default_1510", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_321", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w2", + "name": "alias_default_1276", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1276", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_253", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w2", + "name": "einsum_default_549", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_254", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.8.feed_forward.w2", + "name": "permute_1095", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1276", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_1095", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w2", + "name": "einsum_default_550", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_549", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w2", + "name": "permute_1096", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_1096", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w2", + "name": "dtype_cast_500", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_500", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w2", + "name": "alias_default_1499", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_550", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w2", + "name": "alias_default_1277", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1277", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_250", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "mul_692", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1277", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_252", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "mul_693", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_692", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "alias_default_1278", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1278", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_246", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w3", + "name": "einsum_default_551", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_251", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w3", + "name": "permute_1099", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1278", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1099", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w3", + "name": "einsum_default_552", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_551", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w3", + "name": "permute_1100", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1100", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.8.feed_forward.w3", + "name": "dtype_cast_501", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_501", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w3", + "name": "alias_default_1500", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_693", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "convert_element_type_1824", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_248", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "convert_element_type_1825", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1825", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "alias_default_1279", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1279", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "neg_55", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_55", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "exp_55", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_55", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "add_322", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_322", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "reciprocal_23", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_23", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "mul_694", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_694", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "alias_default_1280", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1824", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1280", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "mul_695", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1280", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "sub_70", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1279", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_70", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "mul_696", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_696", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "add_323", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_695", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_323", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "mul_697", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_697", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "convert_element_type_1826", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1826", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward", + "name": "alias_default_1281", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1281", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_246", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w1", + "name": "einsum_default_553", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_247", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w1", + "name": "permute_1103", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1281", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1103", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w1", + "name": "einsum_default_554", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_552", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_554", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8", + "name": "add_324", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_553", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w1", + "name": "permute_1104", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1104", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.feed_forward.w1", + "name": "dtype_cast_502", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_502", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.8.feed_forward.w1", + "name": "alias_default_1498", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_324", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "convert_element_type_1831", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_242", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "convert_element_type_1832", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_243", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "convert_element_type_1833", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1831", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "alias_default_1282", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1282", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1833", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "mul_698", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1832", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_245", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "mul_699", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_698", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "alias_default_1283", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_699", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": 
"alias_default_1284", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1284", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1283", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "mul_700", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_700", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "sum_141", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1284", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "div_79", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_79", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_141", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "mul_701", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1283", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_701", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "sub_71", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_71", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_245", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "mul_702", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1282", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1284", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "mul_703", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_703", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "sum_142", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_702", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "convert_element_type_1834", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_142", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "convert_element_type_1835", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": 
"add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1276", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1834", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "add_325", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1835", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "dtype_cast_503", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_503", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.ffn_norm", + "name": "alias_default_1502", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { 
+ "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_325", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wo", + "name": "alias_default_1285", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1285", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_240", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wo", + "name": "einsum_default_555", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_241", + "src_placement": "RR", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wo", + "name": "permute_1107", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1285", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1107", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wo", + "name": "einsum_default_556", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_555", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wo", + "name": "permute_1108", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + 
"compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1108", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wo", + "name": "dtype_cast_504", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_504", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention.wo", + "name": "alias_default_1497", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_556", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_1364", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1364", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "permute_1109", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1109", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_236", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_237", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_238", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_239", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_73", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_78", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_79", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_23", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_23", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.sdpa", + "name": "getitem_357", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_23", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.sdpa", + "name": "getitem_358", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_23", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.sdpa", + "name": "getitem_359", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_359", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "permute_1110", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_358", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "permute_1111", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_357", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "permute_1112", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1110", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_1365", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1365", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.8.attention", + "name": "sum_143", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_143", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "squeeze_46", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1111", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_1366", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1366", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "sum_144", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_144", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "squeeze_47", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_47", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "convert_element_type_1840", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1112", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "convert_element_type_1841", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1840", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_1367", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1367", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_as_complex_110", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_235", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "_conj_46", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_46", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "clone_254", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_110", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_254", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": 
"mul_704", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1841", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_1368", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1368", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_as_complex_111", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_235", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "_conj_47", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_47", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "clone_255", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_111", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_255", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "mul_705", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", 
+ "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_704", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_as_real_110", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_110", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_1369", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1369", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "convert_element_type_1842", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_705", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_as_real_111", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_111", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_1370", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1370", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "convert_element_type_1843", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_46", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_1371", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1842", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_1372", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1843", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "view_1373", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1371", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "alias_default_1286", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1286", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_231", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wv", + "name": "einsum_default_557", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_234", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wv", + "name": "permute_1115", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1286", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1115", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention.wv", + "name": "einsum_default_558", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_557", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.8.attention.wv", + "name": "permute_1116", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1116", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wv", + "name": "dtype_cast_505", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_505", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention.wv", + "name": "alias_default_1496", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1372", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "alias_default_1287", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1287", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_231", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wk", + "name": "einsum_default_559", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_233", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wk", + "name": "permute_1119", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1287", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1119", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wk", + "name": "einsum_default_560", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_558", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_560", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "add_326", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_559", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wk", + "name": "permute_1120", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1120", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wk", + "name": "dtype_cast_506", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_506", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention.wk", + "name": "alias_default_1495", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + 
{ + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1373", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention", + "name": "alias_default_1288", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1288", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_231", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wq", + "name": "einsum_default_561", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_232", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wq", + "name": "permute_1123", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1288", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1123", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention.wq", + "name": "einsum_default_562", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_326", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_562", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8", + "name": "add_327", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_561", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wq", + "name": "permute_1124", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1124", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention.wq", + "name": "dtype_cast_507", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_507", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention.wq", + "name": "alias_default_1494", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_327", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "convert_element_type_1856", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_227", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "convert_element_type_1857", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_228", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "convert_element_type_1858", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1856", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "alias_default_1289", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1289", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1858", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "mul_706", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1857", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_230", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "mul_707", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_706", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "alias_default_1290", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_707", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "alias_default_1291", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1291", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1290", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "mul_708", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_708", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "sum_145", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1291", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "div_80", + "op": 
"aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_80", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_145", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "mul_709", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1290", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_709", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "sub_72", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_72", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_230", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "mul_710", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1289", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1291", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "mul_711", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_711", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "sum_146", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_710", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "convert_element_type_1859", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_146", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "convert_element_type_1860", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1285", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1859", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "add_328", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1860", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "dtype_cast_508", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_508", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.8.attention_norm", + "name": "alias_default_1501", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_328", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w2", + "name": "alias_default_1292", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1292", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_225", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w2", + "name": "einsum_default_563", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_226", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.7.feed_forward.w2", + "name": "permute_1127", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1292", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_1127", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w2", + "name": "einsum_default_564", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_563", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w2", + "name": "permute_1128", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_1128", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w2", + "name": "dtype_cast_509", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_509", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w2", + "name": "alias_default_1490", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_564", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w2", + "name": "alias_default_1293", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1293", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_222", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "mul_712", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1293", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_224", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "mul_713", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_712", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "alias_default_1294", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1294", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_218", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w3", + "name": "einsum_default_565", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_223", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w3", + "name": "permute_1131", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1294", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1131", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w3", + "name": "einsum_default_566", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_565", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w3", + "name": "permute_1132", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1132", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.7.feed_forward.w3", + "name": "dtype_cast_510", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_510", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w3", + "name": "alias_default_1491", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_713", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "convert_element_type_1869", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_220", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "convert_element_type_1870", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1870", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "alias_default_1295", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1295", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "neg_56", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "exp_56", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "add_329", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_329", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "reciprocal_24", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "mul_714", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_714", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "alias_default_1296", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1869", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1296", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "mul_715", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1296", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "sub_73", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1295", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_73", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "mul_716", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_716", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "add_330", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_715", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_330", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "mul_717", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_717", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "convert_element_type_1871", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1871", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward", + "name": "alias_default_1297", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1297", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_218", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w1", + "name": "einsum_default_567", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_219", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w1", + "name": "permute_1135", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1297", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1135", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w1", + "name": "einsum_default_568", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_566", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_568", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7", + "name": "add_331", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_567", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w1", + "name": "permute_1136", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1136", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.feed_forward.w1", + "name": "dtype_cast_511", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_511", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.7.feed_forward.w1", + "name": "alias_default_1489", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_331", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "convert_element_type_1876", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_214", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "convert_element_type_1877", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_215", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "convert_element_type_1878", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1876", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "alias_default_1298", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1298", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1878", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "mul_718", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1877", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_217", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "mul_719", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_718", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "alias_default_1299", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_719", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": 
"alias_default_1300", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1300", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1299", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "mul_720", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_720", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "sum_147", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1300", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "div_81", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_81", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_147", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "mul_721", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1299", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_721", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "sub_74", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_74", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_217", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "mul_722", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1298", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1300", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "mul_723", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_723", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "sum_148", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_722", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "convert_element_type_1879", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_148", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "convert_element_type_1880", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": 
"add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1292", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1879", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "add_332", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1880", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "dtype_cast_512", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_512", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.ffn_norm", + "name": "alias_default_1493", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { 
+ "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_332", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wo", + "name": "alias_default_1301", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1301", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_212", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wo", + "name": "einsum_default_569", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_213", + "src_placement": "RR", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wo", + "name": "permute_1139", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1301", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1139", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wo", + "name": "einsum_default_570", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_569", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wo", + "name": "permute_1140", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + 
"compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1140", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wo", + "name": "dtype_cast_513", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_513", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention.wo", + "name": "alias_default_1488", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_570", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_1388", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1388", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "permute_1141", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1141", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_208", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_209", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_210", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_211", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_64", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_69", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_70", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_24", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.sdpa", + "name": "getitem_360", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.sdpa", + "name": "getitem_361", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_24", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.sdpa", + "name": "getitem_362", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_362", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "permute_1142", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_361", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "permute_1143", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_360", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "permute_1144", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1142", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_1389", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1389", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.7.attention", + "name": "sum_149", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_149", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "squeeze_48", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1143", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_1390", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1390", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "sum_150", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_150", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "squeeze_49", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_49", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "convert_element_type_1885", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1144", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "convert_element_type_1886", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1885", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_1391", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1391", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_as_complex_112", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_207", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "_conj_48", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_48", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "clone_262", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_112", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_262", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": 
"mul_724", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1886", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_1392", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1392", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_as_complex_113", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_207", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "_conj_49", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_49", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "clone_263", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_113", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_263", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "mul_725", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", 
+ "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_724", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_as_real_112", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_112", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_1393", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1393", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "convert_element_type_1887", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_725", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_as_real_113", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_113", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_1394", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1394", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "convert_element_type_1888", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_48", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_1395", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1887", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_1396", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1888", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "view_1397", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1395", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "alias_default_1302", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1302", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_203", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wv", + "name": "einsum_default_571", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_206", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wv", + "name": "permute_1147", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1302", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1147", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention.wv", + "name": "einsum_default_572", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_571", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.7.attention.wv", + "name": "permute_1148", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1148", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wv", + "name": "dtype_cast_514", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_514", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention.wv", + "name": "alias_default_1487", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1396", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "alias_default_1303", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1303", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_203", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wk", + "name": "einsum_default_573", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_205", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wk", + "name": "permute_1151", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1303", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1151", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wk", + "name": "einsum_default_574", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_572", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_574", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "add_333", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_573", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wk", + "name": "permute_1152", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1152", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wk", + "name": "dtype_cast_515", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_515", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention.wk", + "name": "alias_default_1486", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + 
{ + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1397", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention", + "name": "alias_default_1304", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1304", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_203", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wq", + "name": "einsum_default_575", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_204", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wq", + "name": "permute_1155", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1304", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1155", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention.wq", + "name": "einsum_default_576", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_333", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_576", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7", + "name": "add_334", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_575", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wq", + "name": "permute_1156", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1156", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention.wq", + "name": "dtype_cast_516", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_516", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention.wq", + "name": "alias_default_1485", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_334", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "convert_element_type_1901", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_199", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "convert_element_type_1902", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_200", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "convert_element_type_1903", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1901", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "alias_default_1305", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1305", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1903", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "mul_726", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1902", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_202", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "mul_727", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_726", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "alias_default_1306", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_727", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "alias_default_1307", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1307", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1306", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "mul_728", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_728", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "sum_151", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1307", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "div_82", + "op": 
"aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_82", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_151", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "mul_729", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1306", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_729", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "sub_75", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_75", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_202", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "mul_730", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1305", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1307", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "mul_731", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_731", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "sum_152", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_730", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "convert_element_type_1904", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_152", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "convert_element_type_1905", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1301", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1904", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "add_335", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1905", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "dtype_cast_517", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_517", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.7.attention_norm", + "name": "alias_default_1492", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_335", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w2", + "name": "alias_default_1308", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1308", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_197", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w2", + "name": "einsum_default_577", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_198", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.6.feed_forward.w2", + "name": "permute_1159", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1308", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_1159", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w2", + "name": "einsum_default_578", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_577", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w2", + "name": "permute_1160", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_1160", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w2", + "name": "dtype_cast_518", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_518", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w2", + "name": "alias_default_1481", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_578", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w2", + "name": "alias_default_1309", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1309", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_194", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "mul_732", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1309", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_196", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "mul_733", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_732", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "alias_default_1310", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1310", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_190", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w3", + "name": "einsum_default_579", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_195", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w3", + "name": "permute_1163", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1310", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1163", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w3", + "name": "einsum_default_580", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_579", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w3", + "name": "permute_1164", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1164", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.6.feed_forward.w3", + "name": "dtype_cast_519", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_519", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w3", + "name": "alias_default_1482", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_733", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "convert_element_type_1914", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_192", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "convert_element_type_1915", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1915", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "alias_default_1311", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1311", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "neg_57", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "exp_57", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "add_336", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_336", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "reciprocal_25", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_25", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "mul_734", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_734", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "alias_default_1312", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1914", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1312", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "mul_735", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1312", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "sub_76", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1311", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_76", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "mul_736", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_736", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "add_337", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_735", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_337", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "mul_737", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_737", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "convert_element_type_1916", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1916", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward", + "name": "alias_default_1313", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1313", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_190", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w1", + "name": "einsum_default_581", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_191", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w1", + "name": "permute_1167", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1313", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1167", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w1", + "name": "einsum_default_582", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_580", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_582", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6", + "name": "add_338", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_581", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w1", + "name": "permute_1168", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1168", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.feed_forward.w1", + "name": "dtype_cast_520", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_520", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.6.feed_forward.w1", + "name": "alias_default_1480", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_338", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "convert_element_type_1921", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_186", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "convert_element_type_1922", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_187", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "convert_element_type_1923", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1921", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "alias_default_1314", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1314", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1923", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "mul_738", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1922", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_189", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "mul_739", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_738", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "alias_default_1315", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_739", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": 
"alias_default_1316", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1316", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1315", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "mul_740", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_740", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "sum_153", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1316", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "div_83", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_83", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_153", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "mul_741", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1315", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_741", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "sub_77", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_77", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_189", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "mul_742", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1314", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1316", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "mul_743", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_743", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "sum_154", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_742", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "convert_element_type_1924", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_154", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "convert_element_type_1925", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": 
"add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1308", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1924", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "add_339", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1925", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "dtype_cast_521", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_521", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.ffn_norm", + "name": "alias_default_1484", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { 
+ "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_339", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wo", + "name": "alias_default_1317", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1317", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_184", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wo", + "name": "einsum_default_583", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_185", + "src_placement": "RR", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wo", + "name": "permute_1171", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1317", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1171", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wo", + "name": "einsum_default_584", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_583", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wo", + "name": "permute_1172", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + 
"compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1172", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wo", + "name": "dtype_cast_522", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_522", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention.wo", + "name": "alias_default_1479", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_584", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_1412", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1412", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "permute_1173", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1173", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_180", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_181", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_182", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_183", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_55", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_60", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_61", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_25", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.sdpa", + "name": "getitem_363", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.sdpa", + "name": "getitem_364", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_25", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.sdpa", + "name": "getitem_365", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_365", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "permute_1174", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_364", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "permute_1175", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_363", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "permute_1176", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1174", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_1413", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1413", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.6.attention", + "name": "sum_155", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_155", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "squeeze_50", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1175", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_1414", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1414", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "sum_156", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_156", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "squeeze_51", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_51", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "convert_element_type_1930", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1176", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "convert_element_type_1931", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1930", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_1415", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1415", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_as_complex_114", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_179", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "_conj_50", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_50", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "clone_270", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_114", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_270", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": 
"mul_744", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1931", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_1416", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1416", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_as_complex_115", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_179", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "_conj_51", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_51", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "clone_271", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_115", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_271", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "mul_745", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", 
+ "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_744", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_as_real_114", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_114", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_1417", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1417", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "convert_element_type_1932", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_745", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_as_real_115", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_115", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_1418", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1418", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "convert_element_type_1933", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_50", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_1419", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1932", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_1420", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1933", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "view_1421", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1419", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "alias_default_1318", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1318", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_175", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wv", + "name": "einsum_default_585", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_178", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wv", + "name": "permute_1179", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1318", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1179", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention.wv", + "name": "einsum_default_586", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_585", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.6.attention.wv", + "name": "permute_1180", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1180", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wv", + "name": "dtype_cast_523", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_523", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention.wv", + "name": "alias_default_1478", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1420", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "alias_default_1319", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1319", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_175", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wk", + "name": "einsum_default_587", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_177", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wk", + "name": "permute_1183", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1319", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1183", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wk", + "name": "einsum_default_588", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_586", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_588", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "add_340", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_587", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wk", + "name": "permute_1184", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1184", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wk", + "name": "dtype_cast_524", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_524", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention.wk", + "name": "alias_default_1477", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + 
{ + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1421", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention", + "name": "alias_default_1320", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1320", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_175", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wq", + "name": "einsum_default_589", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_176", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wq", + "name": "permute_1187", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1320", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1187", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention.wq", + "name": "einsum_default_590", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_340", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_590", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6", + "name": "add_341", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_589", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wq", + "name": "permute_1188", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1188", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention.wq", + "name": "dtype_cast_525", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_525", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention.wq", + "name": "alias_default_1476", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_341", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "convert_element_type_1946", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_171", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "convert_element_type_1947", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_172", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "convert_element_type_1948", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1946", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "alias_default_1321", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1321", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1948", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "mul_746", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1947", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_174", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "mul_747", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_746", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "alias_default_1322", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_747", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "alias_default_1323", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1323", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1322", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "mul_748", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_748", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "sum_157", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1323", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "div_84", + "op": 
"aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_84", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_157", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "mul_749", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1322", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_749", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "sub_78", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_78", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_174", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "mul_750", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1321", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1323", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "mul_751", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_751", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "sum_158", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_750", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "convert_element_type_1949", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_158", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "convert_element_type_1950", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1317", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1949", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "add_342", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1950", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "dtype_cast_526", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_526", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.6.attention_norm", + "name": "alias_default_1483", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_342", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w2", + "name": "alias_default_1324", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1324", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_169", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w2", + "name": "einsum_default_591", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_170", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.5.feed_forward.w2", + "name": "permute_1191", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1324", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_1191", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w2", + "name": "einsum_default_592", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_591", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w2", + "name": "permute_1192", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_1192", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w2", + "name": "dtype_cast_527", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_527", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w2", + "name": "alias_default_1472", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_592", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w2", + "name": "alias_default_1325", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1325", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_166", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "mul_752", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1325", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_168", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "mul_753", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_752", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "alias_default_1326", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1326", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_162", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w3", + "name": "einsum_default_593", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_167", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w3", + "name": "permute_1195", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1326", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1195", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w3", + "name": "einsum_default_594", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_593", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w3", + "name": "permute_1196", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1196", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.5.feed_forward.w3", + "name": "dtype_cast_528", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_528", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w3", + "name": "alias_default_1473", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_753", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "convert_element_type_1959", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_164", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "convert_element_type_1960", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1960", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "alias_default_1327", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1327", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "neg_58", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "exp_58", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "add_343", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_343", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "reciprocal_26", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "mul_754", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_754", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "alias_default_1328", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1959", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1328", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "mul_755", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1328", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "sub_79", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1327", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_79", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "mul_756", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_756", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "add_344", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_755", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_344", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "mul_757", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_757", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "convert_element_type_1961", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1961", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward", + "name": "alias_default_1329", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1329", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_162", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w1", + "name": "einsum_default_595", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_163", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w1", + "name": "permute_1199", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1329", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1199", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w1", + "name": "einsum_default_596", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_594", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_596", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5", + "name": "add_345", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_595", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w1", + "name": "permute_1200", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1200", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.feed_forward.w1", + "name": "dtype_cast_529", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_529", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.5.feed_forward.w1", + "name": "alias_default_1471", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_345", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "convert_element_type_1966", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_158", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "convert_element_type_1967", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_159", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "convert_element_type_1968", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1966", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "alias_default_1330", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1330", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1968", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "mul_758", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1967", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_161", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "mul_759", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_758", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "alias_default_1331", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_759", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": 
"alias_default_1332", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1332", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1331", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "mul_760", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_760", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "sum_159", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1332", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "div_85", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_85", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_159", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "mul_761", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1331", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_761", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "sub_80", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_80", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_161", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "mul_762", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1330", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1332", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "mul_763", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_763", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "sum_160", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_762", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "convert_element_type_1969", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_160", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "convert_element_type_1970", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": 
"add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1324", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1969", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "add_346", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1970", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "dtype_cast_530", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_530", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.ffn_norm", + "name": "alias_default_1475", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { 
+ "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_346", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wo", + "name": "alias_default_1333", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1333", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_156", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wo", + "name": "einsum_default_597", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_157", + "src_placement": "RR", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wo", + "name": "permute_1203", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1333", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1203", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wo", + "name": "einsum_default_598", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_597", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wo", + "name": "permute_1204", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + 
"compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1204", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wo", + "name": "dtype_cast_531", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_531", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention.wo", + "name": "alias_default_1470", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_598", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_1436", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1436", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "permute_1205", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1205", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_152", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_153", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_154", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_155", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_46", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_51", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_52", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_26", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.sdpa", + "name": "getitem_366", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.sdpa", + "name": "getitem_367", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_26", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.sdpa", + "name": "getitem_368", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_368", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "permute_1206", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_367", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "permute_1207", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_366", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "permute_1208", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1206", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_1437", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1437", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.5.attention", + "name": "sum_161", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_161", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "squeeze_52", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1207", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_1438", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1438", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "sum_162", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_162", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "squeeze_53", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_53", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "convert_element_type_1975", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1208", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "convert_element_type_1976", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1975", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_1439", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1439", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_as_complex_116", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_151", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "_conj_52", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_52", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "clone_278", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_116", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_278", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": 
"mul_764", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1976", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_1440", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1440", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_as_complex_117", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_151", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "_conj_53", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_53", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "clone_279", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_117", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_279", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "mul_765", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", 
+ "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_764", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_as_real_116", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_116", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_1441", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1441", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "convert_element_type_1977", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_765", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_as_real_117", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_117", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_1442", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1442", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "convert_element_type_1978", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_1443", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1977", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_1444", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_1978", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "view_1445", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1443", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "alias_default_1334", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1334", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_147", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wv", + "name": "einsum_default_599", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_150", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wv", + "name": "permute_1211", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1334", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1211", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention.wv", + "name": "einsum_default_600", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_599", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.5.attention.wv", + "name": "permute_1212", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1212", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wv", + "name": "dtype_cast_532", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_532", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention.wv", + "name": "alias_default_1469", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1444", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "alias_default_1335", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1335", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_147", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wk", + "name": "einsum_default_601", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_149", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wk", + "name": "permute_1215", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1335", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1215", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wk", + "name": "einsum_default_602", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_600", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_602", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "add_347", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_601", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wk", + "name": "permute_1216", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1216", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wk", + "name": "dtype_cast_533", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_533", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention.wk", + "name": "alias_default_1468", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + 
{ + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1445", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention", + "name": "alias_default_1336", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1336", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_147", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wq", + "name": "einsum_default_603", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_148", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wq", + "name": "permute_1219", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1336", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1219", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention.wq", + "name": "einsum_default_604", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_347", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_604", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5", + "name": "add_348", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_603", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wq", + "name": "permute_1220", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1220", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention.wq", + "name": "dtype_cast_534", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_534", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention.wq", + "name": "alias_default_1467", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_348", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "convert_element_type_1991", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_143", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "convert_element_type_1992", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_144", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "convert_element_type_1993", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1991", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "alias_default_1337", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1337", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_1993", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "mul_766", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1992", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_146", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "mul_767", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_766", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "alias_default_1338", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_767", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "alias_default_1339", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1339", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1338", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "mul_768", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_768", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "sum_163", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1339", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "div_86", + "op": 
"aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_86", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_163", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "mul_769", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1338", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_769", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "sub_81", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_81", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_146", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "mul_770", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1337", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1339", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "mul_771", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_771", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "sum_164", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_770", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "convert_element_type_1994", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_164", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "convert_element_type_1995", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1333", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_1994", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "add_349", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_1995", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "dtype_cast_535", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_535", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.5.attention_norm", + "name": "alias_default_1474", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_349", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w2", + "name": "alias_default_1340", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1340", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_141", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w2", + "name": "einsum_default_605", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_142", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.4.feed_forward.w2", + "name": "permute_1223", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1340", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_1223", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w2", + "name": "einsum_default_606", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_605", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w2", + "name": "permute_1224", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_1224", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w2", + "name": "dtype_cast_536", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_536", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w2", + "name": "alias_default_1463", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_606", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w2", + "name": "alias_default_1341", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1341", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_138", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "mul_772", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1341", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_140", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "mul_773", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_772", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "alias_default_1342", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1342", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_134", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w3", + "name": "einsum_default_607", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_139", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w3", + "name": "permute_1227", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1342", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1227", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w3", + "name": "einsum_default_608", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_607", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w3", + "name": "permute_1228", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1228", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.4.feed_forward.w3", + "name": "dtype_cast_537", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_537", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w3", + "name": "alias_default_1464", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_773", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "convert_element_type_2004", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_136", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "convert_element_type_2005", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2005", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "alias_default_1343", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1343", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "neg_59", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_59", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "exp_59", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_59", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "add_350", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_350", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "reciprocal_27", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_27", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "mul_774", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_774", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "alias_default_1344", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2004", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1344", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "mul_775", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1344", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "sub_82", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1343", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_82", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "mul_776", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_776", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "add_351", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_775", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_351", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "mul_777", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_777", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "convert_element_type_2006", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2006", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward", + "name": "alias_default_1345", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1345", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_134", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w1", + "name": "einsum_default_609", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_135", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w1", + "name": "permute_1231", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1345", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1231", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w1", + "name": "einsum_default_610", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_608", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_610", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4", + "name": "add_352", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_609", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w1", + "name": "permute_1232", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1232", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.feed_forward.w1", + "name": "dtype_cast_538", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_538", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.4.feed_forward.w1", + "name": "alias_default_1462", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_352", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "convert_element_type_2011", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_130", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "convert_element_type_2012", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_131", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "convert_element_type_2013", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2011", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "alias_default_1346", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1346", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_2013", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "mul_778", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2012", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_133", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "mul_779", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_778", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "alias_default_1347", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_779", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": 
"alias_default_1348", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1348", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1347", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "mul_780", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_780", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "sum_165", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1348", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "div_87", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_87", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_165", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "mul_781", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1347", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_781", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "sub_83", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_83", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_133", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "mul_782", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1346", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1348", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "mul_783", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_783", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "sum_166", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_782", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "convert_element_type_2014", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_166", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "convert_element_type_2015", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": 
"add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1340", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2014", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "add_353", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_2015", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "dtype_cast_539", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_539", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.ffn_norm", + "name": "alias_default_1466", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { 
+ "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_353", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wo", + "name": "alias_default_1349", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1349", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_128", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wo", + "name": "einsum_default_611", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_129", + "src_placement": "RR", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wo", + "name": "permute_1235", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1349", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1235", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wo", + "name": "einsum_default_612", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_611", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wo", + "name": "permute_1236", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + 
"compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1236", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wo", + "name": "dtype_cast_540", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_540", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention.wo", + "name": "alias_default_1461", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_612", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_1460", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1460", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "permute_1237", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1237", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_124", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_125", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_126", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_127", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_37", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_42", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_43", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_27", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.sdpa", + "name": "getitem_369", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.sdpa", + "name": "getitem_370", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_27", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.sdpa", + "name": "getitem_371", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_371", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "permute_1238", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_370", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "permute_1239", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_369", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "permute_1240", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1238", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_1461", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1461", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.4.attention", + "name": "sum_167", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_167", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "squeeze_54", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1239", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_1462", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1462", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "sum_168", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_168", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "squeeze_55", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_55", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "convert_element_type_2020", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1240", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "convert_element_type_2021", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2020", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_1463", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1463", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_as_complex_118", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_123", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "_conj_54", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_54", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "clone_286", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_118", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_286", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": 
"mul_784", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2021", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_1464", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1464", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_as_complex_119", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_123", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "_conj_55", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_55", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "clone_287", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_119", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_287", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "mul_785", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", 
+ "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_784", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_as_real_118", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_118", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_1465", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1465", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "convert_element_type_2022", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_785", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_as_real_119", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_119", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_1466", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1466", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "convert_element_type_2023", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_1467", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2022", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_1468", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2023", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "view_1469", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1467", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "alias_default_1350", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1350", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_119", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wv", + "name": "einsum_default_613", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_122", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wv", + "name": "permute_1243", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1350", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1243", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention.wv", + "name": "einsum_default_614", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_613", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.4.attention.wv", + "name": "permute_1244", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1244", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wv", + "name": "dtype_cast_541", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_541", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention.wv", + "name": "alias_default_1460", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1468", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "alias_default_1351", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1351", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_119", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wk", + "name": "einsum_default_615", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_121", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wk", + "name": "permute_1247", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1351", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1247", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wk", + "name": "einsum_default_616", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_614", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_616", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "add_354", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_615", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wk", + "name": "permute_1248", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1248", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wk", + "name": "dtype_cast_542", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_542", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention.wk", + "name": "alias_default_1459", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + 
{ + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1469", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention", + "name": "alias_default_1352", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1352", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_119", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wq", + "name": "einsum_default_617", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_120", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wq", + "name": "permute_1251", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 
4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1352", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1251", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention.wq", + "name": "einsum_default_618", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_354", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_618", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4", + "name": "add_355", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_617", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wq", + "name": "permute_1252", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1252", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention.wq", + "name": "dtype_cast_543", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_543", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention.wq", + "name": "alias_default_1458", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_355", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "convert_element_type_2036", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_115", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "convert_element_type_2037", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_116", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "convert_element_type_2038", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2036", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "alias_default_1353", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1353", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_2038", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "mul_786", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2037", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_118", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "mul_787", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_786", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "alias_default_1354", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_787", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "alias_default_1355", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1355", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1354", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "mul_788", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_788", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "sum_169", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1355", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "div_88", + "op": 
"aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_88", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_169", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "mul_789", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1354", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_789", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "sub_84", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_84", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_118", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "mul_790", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1353", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1355", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "mul_791", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_791", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "sum_170", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_790", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "convert_element_type_2039", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_170", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "convert_element_type_2040", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1349", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2039", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "add_356", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_2040", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "dtype_cast_544", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_544", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.4.attention_norm", + "name": "alias_default_1465", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_356", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w2", + "name": "alias_default_1356", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1356", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_113", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w2", + "name": "einsum_default_619", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_114", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.3.feed_forward.w2", + "name": "permute_1255", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1356", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_1255", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w2", + "name": "einsum_default_620", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_619", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w2", + "name": "permute_1256", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_1256", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w2", + "name": "dtype_cast_545", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_545", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w2", + "name": "alias_default_1454", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_620", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w2", + "name": "alias_default_1357", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1357", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_110", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "mul_792", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1357", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_112", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "mul_793", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_792", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "alias_default_1358", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1358", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_106", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w3", + "name": "einsum_default_621", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_111", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w3", + "name": "permute_1259", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1358", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1259", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w3", + "name": "einsum_default_622", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_621", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w3", + "name": "permute_1260", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1260", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.3.feed_forward.w3", + "name": "dtype_cast_546", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_546", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w3", + "name": "alias_default_1455", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_793", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "convert_element_type_2049", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_108", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "convert_element_type_2050", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2050", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "alias_default_1359", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1359", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "neg_60", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_60", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "exp_60", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_60", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "add_357", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_357", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "reciprocal_28", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "mul_794", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_794", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "alias_default_1360", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2049", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1360", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "mul_795", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1360", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "sub_85", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1359", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_85", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "mul_796", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_796", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "add_358", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_795", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_358", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "mul_797", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_797", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "convert_element_type_2051", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2051", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward", + "name": "alias_default_1361", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1361", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_106", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w1", + "name": "einsum_default_623", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ 
+ { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_107", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w1", + "name": "permute_1263", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1361", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1263", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w1", + "name": "einsum_default_624", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_622", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_624", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3", + "name": "add_359", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_623", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w1", + "name": "permute_1264", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1264", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.feed_forward.w1", + "name": "dtype_cast_547", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_547", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.3.feed_forward.w1", + "name": "alias_default_1453", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_359", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "convert_element_type_2056", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_102", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "convert_element_type_2057", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_103", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "convert_element_type_2058", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2056", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "alias_default_1362", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1362", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_2058", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "mul_798", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2057", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_105", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "mul_799", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_798", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "alias_default_1363", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_799", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": 
"alias_default_1364", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1364", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1363", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "mul_800", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_800", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "sum_171", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1364", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "div_89", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_89", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_171", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "mul_801", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1363", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_801", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "sub_86", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_86", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_105", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "mul_802", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1362", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1364", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "mul_803", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_803", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "sum_172", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_802", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "convert_element_type_2059", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_172", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "convert_element_type_2060", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": 
"add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1356", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2059", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "add_360", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_2060", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "dtype_cast_548", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_548", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.ffn_norm", + "name": "alias_default_1457", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { 
+ "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_360", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wo", + "name": "alias_default_1365", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1365", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_100", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wo", + "name": "einsum_default_625", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_101", + "src_placement": "RR", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wo", + "name": "permute_1267", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1365", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1267", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wo", + "name": "einsum_default_626", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_625", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wo", + "name": "permute_1268", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + 
"compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1268", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wo", + "name": "dtype_cast_549", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_549", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention.wo", + "name": "alias_default_1452", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_626", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_1484", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1484", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "permute_1269", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1269", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_96", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_97", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_98", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_99", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_28", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_33", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_34", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_28", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_28", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.sdpa", + "name": "getitem_372", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_28", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.sdpa", + "name": "getitem_373", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_28", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.sdpa", + "name": "getitem_374", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_374", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "permute_1270", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_373", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "permute_1271", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_372", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "permute_1272", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1270", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_1485", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1485", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.3.attention", + "name": "sum_173", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_173", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "squeeze_56", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1271", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_1486", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1486", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "sum_174", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_174", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "squeeze_57", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "convert_element_type_2065", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1272", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "convert_element_type_2066", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2065", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_1487", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1487", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_as_complex_120", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_95", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "_conj_56", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_56", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "clone_294", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_120", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_294", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": 
"mul_804", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2066", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_1488", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1488", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_as_complex_121", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_95", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "_conj_57", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_57", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "clone_295", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_121", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_295", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "mul_805", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + 
"line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_804", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_as_real_120", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_120", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_1489", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1489", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "convert_element_type_2067", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_805", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_as_real_121", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_121", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_1490", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1490", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "convert_element_type_2068", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_1491", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2067", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_1492", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2068", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "view_1493", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1491", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "alias_default_1366", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1366", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_91", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wv", + "name": "einsum_default_627", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_94", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wv", + "name": "permute_1275", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1366", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1275", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention.wv", + "name": "einsum_default_628", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_627", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.3.attention.wv", + "name": "permute_1276", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1276", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wv", + "name": "dtype_cast_550", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_550", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention.wv", + "name": "alias_default_1451", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1492", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "alias_default_1367", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1367", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_91", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wk", + "name": "einsum_default_629", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_93", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wk", + "name": "permute_1279", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1367", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1279", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wk", + "name": "einsum_default_630", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_628", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_630", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "add_361", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_629", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wk", + "name": "permute_1280", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1280", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wk", + "name": "dtype_cast_551", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_551", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention.wk", + "name": "alias_default_1450", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + 
{ + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1493", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention", + "name": "alias_default_1368", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1368", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_91", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wq", + "name": "einsum_default_631", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_92", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wq", + "name": "permute_1283", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 
+ ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1368", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1283", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention.wq", + "name": "einsum_default_632", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_361", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_632", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3", + "name": "add_362", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_631", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wq", + "name": "permute_1284", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1284", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention.wq", + "name": "dtype_cast_552", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_552", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention.wq", + "name": "alias_default_1449", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_362", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "convert_element_type_2081", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_87", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "convert_element_type_2082", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_88", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "convert_element_type_2083", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2081", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "alias_default_1369", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1369", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_2083", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "mul_806", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2082", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_90", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "mul_807", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_806", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "alias_default_1370", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_807", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "alias_default_1371", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1371", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1370", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "mul_808", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_808", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "sum_175", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1371", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "div_90", + "op": 
"aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_90", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_175", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "mul_809", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1370", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_809", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "sub_87", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_87", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_90", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "mul_810", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1369", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1371", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "mul_811", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_811", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "sum_176", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_810", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "convert_element_type_2084", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_176", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "convert_element_type_2085", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1365", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2084", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "add_363", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_2085", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "dtype_cast_553", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_553", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.3.attention_norm", + "name": "alias_default_1456", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_363", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w2", + "name": "alias_default_1372", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1372", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_85", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w2", + "name": "einsum_default_633", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "cluster_root": "permute_1319", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_86", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.2.feed_forward.w2", + "name": "permute_1287", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "cluster_root": "einsum_default_648", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1372", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_1287", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w2", + "name": "einsum_default_634", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_633", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w2", + "name": "permute_1288", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + 
"cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_1288", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w2", + "name": "dtype_cast_554", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_554", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w2", + "name": "alias_default_1445", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "cluster_root": "alias_default_1389", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_634", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w2", + "name": "alias_default_1373", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 116, + "cluster_root": "mul_832", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1373", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_82", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "mul_812", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "cluster_root": "mul_833", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1373", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_84", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "mul_813", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "cluster_root": "alias_default_1390", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(2)", + "name": "mul_812", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "alias_default_1374", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1374", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_78", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w3", + "name": "einsum_default_635", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "cluster_root": "permute_1323", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_83", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w3", + "name": "permute_1291", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "cluster_root": "einsum_default_650", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1374", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1291", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w3", + "name": "einsum_default_636", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_635", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w3", + "name": "permute_1292", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1292", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.2.feed_forward.w3", + "name": "dtype_cast_555", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_555", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w3", + "name": "alias_default_1446", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "cluster_root": "convert_element_type_2139", + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_813", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "convert_element_type_2094", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "cluster_root": "convert_element_type_2140", + "compute_cost": 
136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_80", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "convert_element_type_2095", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "cluster_root": "alias_default_1391", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2095", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "alias_default_1375", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "cluster_root": "neg_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1375", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "neg_61", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "cluster_root": "exp_62", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_61", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "exp_61", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "cluster_root": "add_371", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_61", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "add_364", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "cluster_root": "reciprocal_30", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_364", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "reciprocal_29", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + 
"source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "cluster_root": "mul_834", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "mul_814", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "cluster_root": "alias_default_1392", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_814", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "alias_default_1376", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "cluster_root": "mul_835", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2094", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1376", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "mul_815", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "cluster_root": "sub_91", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1376", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "sub_88", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "cluster_root": "mul_836", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1375", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_88", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "mul_816", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 133, + "cluster_root": "add_372", + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_816", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "add_365", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "cluster_root": "mul_837", + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_815", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_365", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "mul_817", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "cluster_root": "convert_element_type_2141", + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_817", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "convert_element_type_2096", + "op": "prims.convert_element_type.default", + "phase": "backward", + 
"placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "cluster_root": "alias_default_1393", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2096", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward", + "name": "alias_default_1377", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1377", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_78", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w1", + "name": "einsum_default_637", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "cluster_root": "permute_1327", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + 
{ + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_79", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w1", + "name": "permute_1295", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "cluster_root": "einsum_default_652", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1377", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1295", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w1", + "name": "einsum_default_638", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_636", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_638", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2", + "name": "add_366", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": 
"S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_637", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w1", + "name": "permute_1296", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1296", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.feed_forward.w1", + "name": "dtype_cast_556", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_556", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + 
"module_path": "L['self'].layers.2.feed_forward.w1", + "name": "alias_default_1444", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "cluster_root": "convert_element_type_2146", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_366", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "convert_element_type_2101", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "cluster_root": "convert_element_type_2147", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_74", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "convert_element_type_2102", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "cluster_root": "convert_element_type_2148", + 
"compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_75", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "convert_element_type_2103", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "cluster_root": "alias_default_1394", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2101", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "alias_default_1378", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "cluster_root": "mul_838", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1378", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_2103", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "mul_818", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "cluster_root": "mul_839", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2102", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_77", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "mul_819", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "cluster_root": "alias_default_1395", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_818", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "alias_default_1379", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "cluster_root": "alias_default_1396", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_819", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": 
"alias_default_1380", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "cluster_root": "mul_840", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1380", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1379", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "mul_820", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "cluster_root": "sum_183", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_820", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "sum_177", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "cluster_root": "div_93", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1380", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "div_91", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "cluster_root": "mul_841", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_91", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_177", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "mul_821", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + "cluster_root": "sub_92", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1379", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_821", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "sub_89", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "cluster_root": "mul_842", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_89", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_77", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "mul_822", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1378", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1380", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "mul_823", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "mul_823", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "sum_178", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "cluster_root": "convert_element_type_2149", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_822", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "convert_element_type_2104", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_178", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "convert_element_type_2105", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": 
"add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1372", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2104", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "add_367", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_2105", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "dtype_cast_557", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_557", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.ffn_norm", + "name": "alias_default_1448", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { 
+ "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_367", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wo", + "name": "alias_default_1381", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1381", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_72", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wo", + "name": "einsum_default_639", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "cluster_root": "permute_1331", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_73", + "src_placement": "RR", + "transition_cost": 
0 + } + ], + "module_path": "L['self'].layers.2.attention.wo", + "name": "permute_1299", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "cluster_root": "einsum_default_654", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1381", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1299", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wo", + "name": "einsum_default_640", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_639", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wo", + "name": "permute_1300", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1300", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wo", + "name": "dtype_cast_558", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_558", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention.wo", + "name": "alias_default_1443", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "cluster_root": "view_1532", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_640", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_1508", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "cluster_root": "permute_1333", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1508", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "permute_1301", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "cluster_root": "_scaled_dot_product_flash_attention_backward_30", + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1301", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_68", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_69", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_70", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_71", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_19", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_24", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + 
"dst_placement": "RR", + "name": "getitem_25", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_29", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "cluster_root": "getitem_378", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_29", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.sdpa", + "name": "getitem_375", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "cluster_root": "getitem_379", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_29", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.sdpa", + "name": "getitem_376", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "cluster_root": "getitem_380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_29", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.sdpa", + "name": "getitem_377", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "cluster_root": "permute_1334", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_377", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "permute_1302", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "cluster_root": "permute_1335", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_376", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "permute_1303", + "op": "aten.permute.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "cluster_root": "permute_1336", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_375", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "permute_1304", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "cluster_root": "view_1533", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1302", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_1509", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "cluster_root": "sum_185", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1509", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.2.attention", + "name": "sum_179", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "cluster_root": "squeeze_60", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_179", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "squeeze_58", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "cluster_root": "view_1534", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1303", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_1510", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "cluster_root": "sum_186", + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1510", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "sum_180", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "cluster_root": "squeeze_61", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_180", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "squeeze_59", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "cluster_root": "convert_element_type_2155", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_59", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "convert_element_type_2110", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "cluster_root": "convert_element_type_2156", + "compute_cost": 
39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1304", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "convert_element_type_2111", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "cluster_root": "view_1535", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2110", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_1511", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "cluster_root": "view_as_complex_124", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1511", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_as_complex_122", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "cluster_root": "_conj_60", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_67", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "_conj_58", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "cluster_root": "clone_310", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_58", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "clone_302", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "cluster_root": "mul_844", + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_122", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_302", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": 
"mul_824", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "cluster_root": "view_1536", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2111", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_1512", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "cluster_root": "view_as_complex_125", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1512", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_as_complex_123", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "cluster_root": "_conj_61", + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + 
"name": "alias_default_67", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "_conj_59", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 184, + "cluster_root": "clone_311", + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_59", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "clone_303", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "cluster_root": "mul_845", + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_123", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_303", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "mul_825", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + 
"line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "cluster_root": "view_as_real_124", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_824", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_as_real_122", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "cluster_root": "view_1537", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_122", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_1513", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "cluster_root": "convert_element_type_2157", + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1513", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "convert_element_type_2112", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "cluster_root": "view_as_real_125", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_825", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_as_real_123", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "cluster_root": "view_1538", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_123", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_1514", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "cluster_root": "convert_element_type_2158", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1514", + "src_placement": 
"S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "convert_element_type_2113", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "cluster_root": "view_1539", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_58", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_1515", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "cluster_root": "view_1540", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2112", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_1516", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "cluster_root": "view_1541", + "compute_cost": 0.0, + "dtype": 
"bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2113", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "view_1517", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "cluster_root": "alias_default_1398", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1515", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "alias_default_1382", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1382", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_63", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wv", + "name": "einsum_default_641", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "cluster_root": "permute_1339", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_66", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wv", + "name": "permute_1307", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "cluster_root": "einsum_default_656", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1382", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1307", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention.wv", + "name": "einsum_default_642", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_641", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.2.attention.wv", + "name": "permute_1308", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1308", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wv", + "name": "dtype_cast_559", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_559", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention.wv", + "name": "alias_default_1442", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "cluster_root": "alias_default_1399", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1516", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "alias_default_1383", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1383", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_63", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wk", + "name": "einsum_default_643", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "cluster_root": "permute_1343", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_65", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wk", + "name": "permute_1311", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "cluster_root": "einsum_default_658", + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1383", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1311", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wk", + "name": "einsum_default_644", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_642", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_644", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "add_368", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_643", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wk", + "name": "permute_1312", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1312", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wk", + "name": "dtype_cast_560", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_560", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention.wk", + "name": "alias_default_1441", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + 
{ + "cluster_id": 202, + "cluster_root": "alias_default_1400", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1517", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention", + "name": "alias_default_1384", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1384", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_63", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wq", + "name": "einsum_default_645", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "cluster_root": "permute_1347", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_64", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wq", + "name": "permute_1315", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 
+ ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "cluster_root": "einsum_default_660", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1384", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1315", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention.wq", + "name": "einsum_default_646", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_368", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_646", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2", + "name": "add_369", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": 
"permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_645", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wq", + "name": "permute_1316", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1316", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention.wq", + "name": "dtype_cast_561", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_561", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention.wq", + "name": "alias_default_1440", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "cluster_root": "convert_element_type_2171", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_369", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "convert_element_type_2126", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "cluster_root": "convert_element_type_2172", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_59", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "convert_element_type_2127", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "cluster_root": "convert_element_type_2173", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_60", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "convert_element_type_2128", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "cluster_root": "alias_default_1401", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2126", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "alias_default_1385", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "cluster_root": "mul_846", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1385", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_2128", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "mul_826", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "cluster_root": "mul_847", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2127", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_62", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "mul_827", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "cluster_root": "alias_default_1402", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_826", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "alias_default_1386", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "cluster_root": "alias_default_1403", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_827", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "alias_default_1387", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "cluster_root": "mul_848", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1387", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1386", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "mul_828", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "cluster_root": "sum_187", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_828", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "sum_181", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "cluster_root": "div_94", + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1387", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "div_92", + "op": 
"aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "cluster_root": "mul_849", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_92", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_181", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "mul_829", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "cluster_root": "sub_93", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1386", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_829", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "sub_90", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + 
"cluster_root": "mul_850", + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_90", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_62", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "mul_830", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1385", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1387", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "mul_831", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_831", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "sum_182", + "op": 
"aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "cluster_root": "convert_element_type_2174", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_830", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "convert_element_type_2129", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_182", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "convert_element_type_2130", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1381", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2129", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "add_370", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_2130", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "dtype_cast_562", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_562", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.2.attention_norm", + "name": "alias_default_1447", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_370", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": "alias_default_1388", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "cluster_root": "einsum_default_661", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1388", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_57", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": "einsum_default_647", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 113, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_58", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": 
"permute_1319", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 114, + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1388", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_1319", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": "einsum_default_648", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "cluster_root": "permute_1352", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_647", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": "permute_1320", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + "cluster_root": "dtype_cast_572", + "compute_cost": 34.16146805037313, + "dtype": 
"float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "permute_1320", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": "dtype_cast_563", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "cluster_root": "alias_default_1427", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_563", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": "alias_default_1436", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 115, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_648", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w2", + "name": "alias_default_1389", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 116, + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1389", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_54", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "mul_832", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 117, + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1389", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_56", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "mul_833", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 118, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_832", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "alias_default_1390", + "op": "aten.alias.default", + 
"phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "cluster_root": "einsum_default_663", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1390", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_50", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w3", + "name": "einsum_default_649", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 119, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_55", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w3", + "name": "permute_1323", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 120, + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + 
"name": "alias_default_1390", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1323", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w3", + "name": "einsum_default_650", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "cluster_root": "permute_1356", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_649", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w3", + "name": "permute_1324", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "cluster_root": "dtype_cast_573", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1324", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w3", + "name": "dtype_cast_564", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "cluster_root": "alias_default_1428", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_564", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w3", + "name": "alias_default_1437", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 121, + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_833", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "convert_element_type_2139", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 122, + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_52", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "convert_element_type_2140", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + 
"shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 123, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2140", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "alias_default_1391", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 124, + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1391", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "neg_62", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 125, + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_62", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "exp_62", + "op": "aten.exp.default", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 126, + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_62", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "add_371", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 127, + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_371", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "reciprocal_30", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 128, + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_30", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "mul_834", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 129, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_834", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "alias_default_1392", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 130, + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2139", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1392", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "mul_835", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 131, + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1392", + "src_placement": "S(0)S(2)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "sub_91", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 132, + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1391", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_91", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "mul_836", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 133, + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_836", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "add_372", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 134, + "compute_cost": 273.29174440298505, + "dtype": "float32", 
+ "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_835", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_372", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "mul_837", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 135, + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_837", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "convert_element_type_2141", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 136, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2141", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward", + "name": "alias_default_1393", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "cluster_root": "einsum_default_665", + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1393", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_50", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w1", + "name": "einsum_default_651", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 137, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_51", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w1", + "name": "permute_1327", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 138, + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1393", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1327", + 
"src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w1", + "name": "einsum_default_652", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 139, + "cluster_root": "add_163", + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_650", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_652", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1", + "name": "add_373", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "cluster_root": "permute_1360", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_651", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w1", + "name": "permute_1328", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + 
"line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "cluster_root": "dtype_cast_574", + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1328", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w1", + "name": "dtype_cast_565", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "cluster_root": "alias_default_1426", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_565", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.feed_forward.w1", + "name": "alias_default_1435", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 140, + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_373", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "convert_element_type_2146", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": 
"return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 141, + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_46", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "convert_element_type_2147", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 142, + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_47", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "convert_element_type_2148", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 143, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2146", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "alias_default_1394", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": 
"return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 144, + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1394", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_2148", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "mul_838", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 145, + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2147", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_49", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "mul_839", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 146, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_838", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "alias_default_1395", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 147, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_839", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "alias_default_1396", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 148, + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1396", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1395", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "mul_840", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 149, + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + 
"comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_840", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "sum_183", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 150, + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1396", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "div_93", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 151, + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_93", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_183", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "mul_841", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 152, + 
"compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1395", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_841", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "sub_92", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 153, + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_92", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_49", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "mul_842", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "cluster_root": "mul_863", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1394", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1396", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.1.ffn_norm", + "name": "mul_843", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "cluster_root": "sum_190", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_843", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "sum_184", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 154, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_842", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "convert_element_type_2149", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "cluster_root": "convert_element_type_2195", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_184", + "src_placement": "P(sum)P(sum)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "convert_element_type_2150", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 155, + "cluster_root": "add_164", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1388", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2149", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "add_374", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "cluster_root": "dtype_cast_575", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_2150", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "dtype_cast_566", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + 
"transition_cost": 0.0 + }, + { + "cluster_id": 235, + "cluster_root": "alias_default_1430", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_566", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.ffn_norm", + "name": "alias_default_1439", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 156, + "cluster_root": "alias_default_917", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_374", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wo", + "name": "alias_default_1397", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "cluster_root": "einsum_default_667", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1397", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_44", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wo", + "name": "einsum_default_653", + "op": "aten.einsum.default", + "phase": "backward", + 
"placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 157, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_45", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wo", + "name": "permute_1331", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 158, + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1397", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1331", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wo", + "name": "einsum_default_654", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "cluster_root": "permute_1364", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_653", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.1.attention.wo", + "name": "permute_1332", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "cluster_root": "dtype_cast_576", + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1332", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wo", + "name": "dtype_cast_567", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "cluster_root": "alias_default_1425", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_567", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention.wo", + "name": "alias_default_1434", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 159, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": 
"S(0)S(2)", + "name": "einsum_default_654", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_1532", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 160, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1532", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "permute_1333", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 161, + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1333", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_40", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_41", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_42", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_43", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 
0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_10", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_15", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_16", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_30", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 162, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_30", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.sdpa", + "name": "getitem_378", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 163, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_30", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.sdpa", + "name": "getitem_379", + "op": "", + "phase": "backward", + 
"placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 164, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_30", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.sdpa", + "name": "getitem_380", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 165, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_380", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "permute_1334", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 166, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_379", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "permute_1335", + 
"op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 167, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_378", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "permute_1336", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 168, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1334", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_1533", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 169, + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1533", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.1.attention", + "name": "sum_185", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 170, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_185", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "squeeze_60", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 171, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1335", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_1534", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 172, + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1534", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + 
"name": "sum_186", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 173, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_186", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "squeeze_61", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 174, + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_61", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "convert_element_type_2155", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 175, + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1336", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.1.attention", + "name": "convert_element_type_2156", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 176, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2155", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_1535", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 177, + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1535", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_as_complex_124", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 178, + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_39", + 
"src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "_conj_60", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 179, + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_60", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "clone_310", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 180, + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_124", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_310", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "mul_844", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 181, + "compute_cost": 
0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2156", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_1536", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 182, + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1536", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_as_complex_125", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 183, + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_39", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "_conj_61", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { 
+ "cluster_id": 184, + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_61", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "clone_311", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 185, + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_125", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_311", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "mul_845", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 186, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_844", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_as_real_124", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 187, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_124", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_1537", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 188, + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1537", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "convert_element_type_2157", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 189, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_845", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_as_real_125", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 
8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 190, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_125", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_1538", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 191, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1538", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "convert_element_type_2158", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 192, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_60", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + 
"name": "view_1539", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 193, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2157", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_1540", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 194, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2158", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "view_1541", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 195, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1539", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "alias_default_1398", + 
"op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "cluster_root": "einsum_default_669", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1398", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_35", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wv", + "name": "einsum_default_655", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 196, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_38", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wv", + "name": "permute_1339", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 197, + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + 
"dst_placement": "S(0)S(1)", + "name": "alias_default_1398", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1339", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention.wv", + "name": "einsum_default_656", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + "cluster_root": "permute_1372", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_655", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wv", + "name": "permute_1340", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "cluster_root": "dtype_cast_577", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1340", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wv", + "name": "dtype_cast_568", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "cluster_root": "alias_default_1424", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_568", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention.wv", + "name": "alias_default_1433", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 198, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1540", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "alias_default_1399", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "cluster_root": "einsum_default_671", + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1399", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_35", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wk", + 
"name": "einsum_default_657", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 199, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_37", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wk", + "name": "permute_1343", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 200, + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1399", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1343", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wk", + "name": "einsum_default_658", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 201, + "cluster_root": "add_165", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 
0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_656", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_658", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "add_375", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "cluster_root": "permute_1376", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_657", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wk", + "name": "permute_1344", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "cluster_root": "dtype_cast_578", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1344", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wk", + "name": "dtype_cast_569", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "cluster_root": "alias_default_1423", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_569", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention.wk", + "name": "alias_default_1432", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 202, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1541", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention", + "name": "alias_default_1400", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "cluster_root": "einsum_default_673", + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1400", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_35", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wq", 
+ "name": "einsum_default_659", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 203, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_36", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wq", + "name": "permute_1347", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 204, + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1400", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1347", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention.wq", + "name": "einsum_default_660", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 205, + "cluster_root": "add_166", + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_375", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_660", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1", + "name": "add_376", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "cluster_root": "permute_1380", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_659", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wq", + "name": "permute_1348", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "cluster_root": "dtype_cast_579", + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1348", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention.wq", + "name": "dtype_cast_570", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "cluster_root": "alias_default_1422", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_570", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention.wq", + "name": "alias_default_1431", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 206, + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_376", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "convert_element_type_2171", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 207, + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "convert_element_type_2172", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 
8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 208, + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_32", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "convert_element_type_2173", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 209, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2171", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "alias_default_1401", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 210, + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1401", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_2173", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", 
+ "name": "mul_846", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 211, + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2172", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_34", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "mul_847", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 212, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_846", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "alias_default_1402", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 213, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_847", + "src_placement": 
"S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "alias_default_1403", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 214, + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1403", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1402", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "mul_848", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 215, + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_848", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "sum_187", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 216, + "compute_cost": 52.05557036247335, + 
"dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1403", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "div_94", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 217, + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_94", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_187", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "mul_849", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 218, + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1402", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_849", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "sub_93", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 219, + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_93", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_34", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "mul_850", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "cluster_root": "mul_871", + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1401", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1403", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "mul_851", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "cluster_root": "sum_194", + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", 
+ "name": "mul_851", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "sum_188", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 220, + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_850", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "convert_element_type_2174", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "cluster_root": "convert_element_type_2220", + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_188", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "convert_element_type_2175", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 221, + "cluster_root": "add_167", + "compute_cost": 39.041677771855014, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1397", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2174", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "add_377", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "cluster_root": "dtype_cast_580", + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_2175", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "dtype_cast_571", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "cluster_root": "alias_default_1429", + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_571", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.1.attention_norm", + "name": "alias_default_1438", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 222, + "cluster_root": "alias_default_924", + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)R", + "name": "add_377", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": "alias_default_1404", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)R", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 109, + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1404", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_29", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": "einsum_default_661", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "alias_default_30", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": 
"permute_1351", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_1404", + "src_placement": "S(0)R", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "permute_1351", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": "einsum_default_662", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 110, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "einsum_default_661", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": "permute_1352", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 111, + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": 
"permute_1352", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": "dtype_cast_572", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 112, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(1)", + "name": "dtype_cast_572", + "src_placement": "P(sum)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": "alias_default_1427", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_662", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w2", + "name": "alias_default_1405", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + 
"name": "alias_default_1405", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_26", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "mul_852", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1405", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_28", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "mul_853", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_852", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "alias_default_1406", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 223, + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1406", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_22", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w3", + "name": "einsum_default_663", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_27", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w3", + "name": "permute_1355", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1406", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1355", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.0.feed_forward.w3", + "name": "einsum_default_664", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 224, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_663", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w3", + "name": "permute_1356", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 225, + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1356", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w3", + "name": "dtype_cast_573", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 226, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_573", + "src_placement": "P(sum)S(0)", + 
"transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w3", + "name": "alias_default_1428", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_853", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "convert_element_type_2184", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 136.64587220149252, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_24", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "convert_element_type_2185", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2185", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "alias_default_1407", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1407", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "neg_63", + "op": "aten.neg.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "neg_63", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "exp_63", + "op": "aten.exp.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "exp_63", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.0.feed_forward", + "name": "add_378", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_378", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "reciprocal_31", + "op": "aten.reciprocal.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "reciprocal_31", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "mul_854", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_854", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": 
"alias_default_1408", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2184", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1408", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "mul_855", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1408", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "sub_94", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1407", + 
"src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sub_94", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "mul_856", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 182.1944962686567, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_856", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "add_379", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 273.29174440298505, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_855", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "add_379", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "mul_857", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, 
+ "transition_cost": 0.0 + }, + { + "compute_cost": 136.64587220149252, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_857", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "convert_element_type_2186", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2186", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward", + "name": "alias_default_1409", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 14336 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 227, + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1409", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_22", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w1", + "name": "einsum_default_665", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 14336 + ], + "source": { + "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_23", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w1", + "name": "permute_1359", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 694.8379851971689, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1409", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RS(0)", + "name": "permute_1359", + "src_placement": "RS(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w1", + "name": "einsum_default_666", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 156.16671108742005, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_664", + "src_placement": "S(0)P(sum)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)P(sum)", + "name": "einsum_default_666", + "src_placement": "S(0)P(sum)", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0", + "name": "add_380", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)P(sum)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 228, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_665", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w1", + "name": "permute_1360", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 355 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 229, + "compute_cost": 34.16146805037313, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1360", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w1", + "name": "dtype_cast_574", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 230, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 487.952, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_574", + "src_placement": 
"P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.feed_forward.w1", + "name": "alias_default_1426", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 430.3685785129651, + "dst_placement": "S(0)S(1)", + "name": "add_380", + "src_placement": "S(0)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "convert_element_type_2191", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_18", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "convert_element_type_2192", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_19", + "src_placement": "RR", + 
"transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "convert_element_type_2193", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2191", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "alias_default_1410", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1410", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_2193", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "mul_858", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", 
+ "name": "convert_element_type_2192", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "mul_859", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_858", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "alias_default_1411", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_859", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "alias_default_1412", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + 
"dst_placement": "S(0)S(1)", + "name": "alias_default_1412", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1411", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "mul_860", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_860", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "sum_189", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1412", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "div_95", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + 
"inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_95", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_189", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "mul_861", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1411", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_861", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "sub_95", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_95", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_21", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "mul_862", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + 
], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 231, + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1410", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1412", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "mul_863", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 232, + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_863", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "sum_190", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_862", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "convert_element_type_2194", + "op": 
"prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 233, + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_190", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "convert_element_type_2195", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1404", + "src_placement": "S(0)R", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2194", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "add_381", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 1.0 + }, + { + "cluster_id": 234, + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_2195", + "src_placement": 
"P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "dtype_cast_575", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 235, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_575", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.ffn_norm", + "name": "alias_default_1430", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_381", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wo", + "name": "alias_default_1413", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 236, + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1413", + "src_placement": "S(0)S(1)", 
+ "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_16", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wo", + "name": "einsum_default_667", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_17", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wo", + "name": "permute_1363", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1413", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1363", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wo", + "name": "einsum_default_668", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 237, + "compute_cost": 0.0, + 
"dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "einsum_default_667", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wo", + "name": "permute_1364", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return self.wo(output)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 316 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 238, + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "permute_1364", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wo", + "name": "dtype_cast_576", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 239, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 331.9007188940092, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_576", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention.wo", + "name": "alias_default_1425", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + "dst_placement": "S(0)S(2)", + "name": "einsum_default_668", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_1556", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "output = output.view(bs, seqlen, -1)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 315 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1556", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "permute_1365", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "output = output.transpose(", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 312 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 1985.2513862776257, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "permute_1365", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_12", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_13", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_14", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_15", + 
"src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_1", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_6", + "src_placement": "RR", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "getitem_7", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.sdpa", + "name": "_scaled_dot_product_flash_attention_backward_31", + "op": "aten._scaled_dot_product_flash_attention_backward.default", + "phase": "backward", + "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.sdpa", + "name": "getitem_381", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.sdpa", + "name": "getitem_382", + "op": 
"", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "_scaled_dot_product_flash_attention_backward_31", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.sdpa", + "name": "getitem_383", + "op": "", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 32, + 8192, + 128 + ], + "source": { + "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 53 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_383", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "permute_1366", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 308 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_382", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "permute_1367", + "op": "aten.permute.default", + 
"phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 307 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "getitem_381", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "permute_1368", + "op": "aten.permute.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 306 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1366", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_1557", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1557", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "sum_191", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": 
"S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_191", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "squeeze_62", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1367", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_1558", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 4, + 128 + ], + "source": { + "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 223 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 16.26736573827292, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1558", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "sum_192", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 1, + 128 + ], + "source": { + "code": ".expand(bs, 
slen, n_kv_heads, n_rep, head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 222 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "sum_192", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "squeeze_63", + "op": "aten.squeeze.dim", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "torch.unsqueeze(x, dim=3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "repeat_kv", + "line": 221 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_63", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "convert_element_type_2200", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "permute_1368", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "convert_element_type_2201", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "return xq_out.type_as(xq), 
xk_out.type_as(xk)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 212 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2200", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_1559", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1559", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_as_complex_126", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_11", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "_conj_62", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_62", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "clone_318", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 14.64062916444563, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_126", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_318", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "mul_864", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64 + ], + "source": { + "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 211 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2201", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_1560", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 
2 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1560", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_as_complex_127", + "op": "aten.view_as_complex.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_11", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "_conj_63", + "op": "aten._conj.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 7.0, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "_conj_63", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "clone_319", + "op": "aten.clone.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1, + 8192, + 1, + 64 + ], + "source": { + "code": "xq_out = 
torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 53.68230693630064, + "dtype": "complex64", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_complex_127", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "clone_319", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "mul_865", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64 + ], + "source": { + "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 210 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_864", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_as_real_126", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 64, + 2 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_126", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_1561", + "op": 
"aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 9.760419442963753, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1561", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "convert_element_type_2202", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 8, + 128 + ], + "source": { + "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 208 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "mul_865", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_as_real_127", + "op": "aten.view_as_real.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 64, + 2 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_as_real_127", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.0.attention", + "name": "view_1562", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1562", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "convert_element_type_2203", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 32, + 128 + ], + "source": { + "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "apply_rotary_emb", + "line": 207 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "squeeze_62", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_1563", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2202", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], 
+ "module_path": "L['self'].layers.0.attention", + "name": "view_1564", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "convert_element_type_2203", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "view_1565", + "op": "aten.view.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1563", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "alias_default_1414", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 297 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 240, + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1414", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": 
"alias_default_7", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wv", + "name": "einsum_default_669", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_10", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wv", + "name": "permute_1371", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1414", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 36.328589861751155, + "dst_placement": "RR", + "name": "permute_1371", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention.wv", + "name": "einsum_default_670", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "cluster_id": 241, + 
"compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_669", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wv", + "name": "permute_1372", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 242, + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1372", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wv", + "name": "dtype_cast_577", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 243, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_577", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention.wv", + "name": "alias_default_1424", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, 
+ { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1564", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "alias_default_1415", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 1024 + ], + "source": { + "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 296 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 244, + "compute_cost": 56.12241179704158, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1415", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_7", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wk", + "name": "einsum_default_671", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 1024 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_9", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wk", + "name": "permute_1375", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 49.631284656940636, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 86.07528421052632, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1415", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "permute_1375", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wk", + "name": "einsum_default_672", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_670", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_672", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention", + "name": "add_382", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 245, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_671", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.0.attention.wk", + "name": "permute_1376", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 246, + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1376", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wk", + "name": "dtype_cast_578", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 247, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 57.40529711375213, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_578", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention.wk", + "name": "alias_default_1423", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "view_1565", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + } + ], + 
"module_path": "L['self'].layers.0.attention", + "name": "alias_default_1416", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(2)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 295 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 248, + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(2)", + "name": "alias_default_1416", + "src_placement": "S(0)S(2)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)R", + "name": "alias_default_7", + "src_placement": "S(0)R", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wq", + "name": "einsum_default_673", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "P(sum)S(1)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RS(1)", + "name": "alias_default_8", + "src_placement": "RS(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wq", + "name": "permute_1379", + "op": "aten.permute.default", + "phase": "backward", + "placement": "RS(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 198.52513862776254, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 190.35670720457864, + 
"dst_placement": "S(0)S(1)", + "name": "alias_default_1416", + "src_placement": "S(0)S(2)", + "transition_cost": 1 + }, + { + "comm_cost": 94.3143594470046, + "dst_placement": "RR", + "name": "permute_1379", + "src_placement": "RS(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention.wq", + "name": "einsum_default_674", + "op": "aten.einsum.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 2.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_382", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "einsum_default_674", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0", + "name": "add_383", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 249, + "compute_cost": 0.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(1)", + "name": "einsum_default_673", + "src_placement": "P(sum)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wq", + "name": "permute_1380", + "op": "aten.permute.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", + "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 290 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 250, + "compute_cost": 9.760419442963753, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)S(0)", + "name": "permute_1380", + "src_placement": "P(sum)S(0)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention.wq", + "name": "dtype_cast_579", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 251, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 160.272, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_579", + "src_placement": "P(sum)S(0)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention.wq", + "name": "alias_default_1422", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "add_383", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "convert_element_type_2216", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_3", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "convert_element_type_2217", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_4", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "convert_element_type_2218", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "RR", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2216", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "alias_default_1417", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1417", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "convert_element_type_2218", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "mul_866", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2217", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "mul_867", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_866", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "alias_default_1418", 
+ "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_867", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "alias_default_1419", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1419", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1418", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "mul_868", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_868", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": 
"L['self'].layers.0.attention_norm", + "name": "sum_193", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 1 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.05557036247335, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1419", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "div_96", + "op": "aten.div.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "div_96", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sum_193", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "mul_869", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1418", + "src_placement": "S(0)S(1)", + 
"transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_869", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "sub_96", + "op": "aten.sub.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 52.06192480221486, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "sub_96", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_6", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "mul_870", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 252, + "compute_cost": 78.08335554371003, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1417", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1419", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "mul_871", + "op": "aten.mul.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 253, + "compute_cost": 26.034139620978188, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_871", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "sum_194", + "op": "aten.sum.dim_IntList", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "mul_870", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "convert_element_type_2219", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 254, + "compute_cost": 7.0, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "sum_194", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "convert_element_type_2220", + "op": "prims.convert_element_type.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 39.041677771855014, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "alias_default_1413", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + }, + { + "comm_cost": 0.0, + "dst_placement": "S(0)S(1)", + "name": "convert_element_type_2219", + "src_placement": "S(0)S(1)", + "transition_cost": 0 + } + ], + "name": "add_384", + "op": "aten.add.Tensor", + "phase": "backward", + "placement": "S(0)S(1)", + "shape": [ + 8, + 8192, + 4096 + ], + "source": { + "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", + "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", + "func": "rms_norm", + "line": 2964 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 255, + "compute_cost": 7.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "P(sum)P(sum)", + "name": "convert_element_type_2220", + "src_placement": "P(sum)P(sum)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "dtype_cast_580", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "P(sum)P(sum)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "cluster_id": 256, + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 28.41652038284296, + "dst_placement": "S(0)S(0)", + "name": "dtype_cast_580", + "src_placement": "P(sum)P(sum)", + "transition_cost": 1 + } + ], + "module_path": "L['self'].layers.0.attention_norm", + "name": "alias_default_1429", + "op": "aten.alias.default", + "phase": "backward", + 
"placement": "S(0)S(0)", + "shape": [ + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 38.685829146330285, + "dtype": "bfloat16", + "inputs": [ + { + "comm_cost": 706.2108351658422, + "dst_placement": "S(2)S(2)", + "name": "add_384", + "src_placement": "S(0)S(1)", + "transition_cost": 1 + }, + { + "comm_cost": 0.0, + "dst_placement": "RR", + "name": "alias_default_1", + "src_placement": "RR", + "transition_cost": 0 + } + ], + "module_path": "L['self'].tok_embeddings", + "name": "embedding_dense_backward", + "op": "aten.embedding_dense_backward.default", + "phase": "backward", + "placement": "S(1)S(1)", + "shape": [ + 128256, + 4096 + ], + "source": { + "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", + "func": "forward", + "line": 539 + }, + "transition_cost": 1.0 + }, + { + "compute_cost": 76.40578345195063, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(1)S(1)", + "name": "embedding_dense_backward", + "src_placement": "S(1)S(1)", + "transition_cost": 0 + } + ], + "module_path": "L['self'].tok_embeddings", + "name": "dtype_cast_581", + "op": "autoparallel.dtype_cast.default", + "phase": "backward", + "placement": "S(1)S(1)", + "shape": [ + 128256, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "compute_cost": 0.0, + "dtype": "float32", + "inputs": [ + { + "comm_cost": 0.0, + "dst_placement": "S(1)S(1)", + "name": "dtype_cast_581", + "src_placement": "S(1)S(1)", + "transition_cost": 0 + } + 
], + "module_path": "L['self'].tok_embeddings", + "name": "alias_default_1421", + "op": "aten.alias.default", + "phase": "backward", + "placement": "S(1)S(1)", + "shape": [ + 128256, + 4096 + ], + "source": { + "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", + "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", + "func": "getter", + "line": 25 + }, + "transition_cost": 0.0 + }, + { + "inputs": [ + { + "name": "alias_default_1420" + }, + { + "name": "alias_default_1421" + }, + { + "name": "alias_default_1422" + }, + { + "name": "alias_default_1423" + }, + { + "name": "alias_default_1424" + }, + { + "name": "alias_default_1425" + }, + { + "name": "alias_default_1426" + }, + { + "name": "alias_default_1427" + }, + { + "name": "alias_default_1428" + }, + { + "name": "alias_default_1429" + }, + { + "name": "alias_default_1430" + }, + { + "name": "alias_default_1431" + }, + { + "name": "alias_default_1432" + }, + { + "name": "alias_default_1433" + }, + { + "name": "alias_default_1434" + }, + { + "name": "alias_default_1435" + }, + { + "name": "alias_default_1436" + }, + { + "name": "alias_default_1437" + }, + { + "name": "alias_default_1438" + }, + { + "name": "alias_default_1439" + }, + { + "name": "alias_default_1440" + }, + { + "name": "alias_default_1441" + }, + { + "name": "alias_default_1442" + }, + { + "name": "alias_default_1443" + }, + { + "name": "alias_default_1444" + }, + { + "name": "alias_default_1445" + }, + { + "name": "alias_default_1446" + }, + { + "name": "alias_default_1447" + }, + { + "name": "alias_default_1448" + }, + { + "name": "alias_default_1449" + }, + { + "name": "alias_default_1450" + }, + { + "name": "alias_default_1451" + }, + { + "name": "alias_default_1452" + }, + { + "name": "alias_default_1453" + }, + { + "name": "alias_default_1454" + }, + { + "name": "alias_default_1455" + }, + { + "name": "alias_default_1456" + }, + { + "name": "alias_default_1457" + }, + { + "name": 
"alias_default_1458" + }, + { + "name": "alias_default_1459" + }, + { + "name": "alias_default_1460" + }, + { + "name": "alias_default_1461" + }, + { + "name": "alias_default_1462" + }, + { + "name": "alias_default_1463" + }, + { + "name": "alias_default_1464" + }, + { + "name": "alias_default_1465" + }, + { + "name": "alias_default_1466" + }, + { + "name": "alias_default_1467" + }, + { + "name": "alias_default_1468" + }, + { + "name": "alias_default_1469" + }, + { + "name": "alias_default_1470" + }, + { + "name": "alias_default_1471" + }, + { + "name": "alias_default_1472" + }, + { + "name": "alias_default_1473" + }, + { + "name": "alias_default_1474" + }, + { + "name": "alias_default_1475" + }, + { + "name": "alias_default_1476" + }, + { + "name": "alias_default_1477" + }, + { + "name": "alias_default_1478" + }, + { + "name": "alias_default_1479" + }, + { + "name": "alias_default_1480" + }, + { + "name": "alias_default_1481" + }, + { + "name": "alias_default_1482" + }, + { + "name": "alias_default_1483" + }, + { + "name": "alias_default_1484" + }, + { + "name": "alias_default_1485" + }, + { + "name": "alias_default_1486" + }, + { + "name": "alias_default_1487" + }, + { + "name": "alias_default_1488" + }, + { + "name": "alias_default_1489" + }, + { + "name": "alias_default_1490" + }, + { + "name": "alias_default_1491" + }, + { + "name": "alias_default_1492" + }, + { + "name": "alias_default_1493" + }, + { + "name": "alias_default_1494" + }, + { + "name": "alias_default_1495" + }, + { + "name": "alias_default_1496" + }, + { + "name": "alias_default_1497" + }, + { + "name": "alias_default_1498" + }, + { + "name": "alias_default_1499" + }, + { + "name": "alias_default_1500" + }, + { + "name": "alias_default_1501" + }, + { + "name": "alias_default_1502" + }, + { + "name": "alias_default_1503" + }, + { + "name": "alias_default_1504" + }, + { + "name": "alias_default_1505" + }, + { + "name": "alias_default_1506" + }, + { + "name": "alias_default_1507" + }, + { + "name": 
"alias_default_1508" + }, + { + "name": "alias_default_1509" + }, + { + "name": "alias_default_1510" + }, + { + "name": "alias_default_1511" + }, + { + "name": "alias_default_1512" + }, + { + "name": "alias_default_1513" + }, + { + "name": "alias_default_1514" + }, + { + "name": "alias_default_1515" + }, + { + "name": "alias_default_1516" + }, + { + "name": "alias_default_1517" + }, + { + "name": "alias_default_1518" + }, + { + "name": "alias_default_1519" + }, + { + "name": "alias_default_1520" + }, + { + "name": "alias_default_1521" + }, + { + "name": "alias_default_1522" + }, + { + "name": "alias_default_1523" + }, + { + "name": "alias_default_1524" + }, + { + "name": "alias_default_1525" + }, + { + "name": "alias_default_1526" + }, + { + "name": "alias_default_1527" + }, + { + "name": "alias_default_1528" + }, + { + "name": "alias_default_1529" + }, + { + "name": "alias_default_1530" + }, + { + "name": "alias_default_1531" + }, + { + "name": "alias_default_1532" + }, + { + "name": "alias_default_1533" + }, + { + "name": "alias_default_1534" + }, + { + "name": "alias_default_1535" + }, + { + "name": "alias_default_1536" + }, + { + "name": "alias_default_1537" + }, + { + "name": "alias_default_1538" + }, + { + "name": "alias_default_1539" + }, + { + "name": "alias_default_1540" + }, + { + "name": "alias_default_1541" + }, + { + "name": "alias_default_1542" + }, + { + "name": "alias_default_1543" + }, + { + "name": "alias_default_1544" + }, + { + "name": "alias_default_1545" + }, + { + "name": "alias_default_1546" + }, + { + "name": "alias_default_1547" + }, + { + "name": "alias_default_1548" + }, + { + "name": "alias_default_1549" + }, + { + "name": "alias_default_1550" + }, + { + "name": "alias_default_1551" + }, + { + "name": "alias_default_1552" + }, + { + "name": "alias_default_1553" + }, + { + "name": "alias_default_1554" + }, + { + "name": "alias_default_1555" + }, + { + "name": "alias_default_1556" + }, + { + "name": "alias_default_1557" + }, + { + "name": 
"alias_default_1558" + }, + { + "name": "alias_default_1559" + }, + { + "name": "alias_default_1560" + }, + { + "name": "alias_default_1561" + }, + { + "name": "alias_default_1562" + }, + { + "name": "alias_default_1563" + }, + { + "name": "alias_default_1564" + }, + { + "name": "alias_default_1565" + }, + { + "name": "alias_default_1566" + }, + { + "name": "alias_default_1567" + }, + { + "name": "alias_default_1568" + }, + { + "name": "alias_default_1569" + }, + { + "name": "alias_default_1570" + }, + { + "name": "alias_default_1571" + }, + { + "name": "alias_default_1572" + }, + { + "name": "alias_default_1573" + }, + { + "name": "alias_default_1574" + }, + { + "name": "alias_default_1575" + }, + { + "name": "alias_default_1576" + }, + { + "name": "alias_default_1577" + }, + { + "name": "alias_default_1578" + }, + { + "name": "alias_default_1579" + }, + { + "name": "alias_default_1580" + }, + { + "name": "alias_default_1581" + }, + { + "name": "alias_default_1582" + }, + { + "name": "alias_default_1583" + }, + { + "name": "alias_default_1584" + }, + { + "name": "alias_default_1585" + }, + { + "name": "alias_default_1586" + }, + { + "name": "alias_default_1587" + }, + { + "name": "alias_default_1588" + }, + { + "name": "alias_default_1589" + }, + { + "name": "alias_default_1590" + }, + { + "name": "alias_default_1591" + }, + { + "name": "alias_default_1592" + }, + { + "name": "alias_default_1593" + }, + { + "name": "alias_default_1594" + }, + { + "name": "alias_default_1595" + }, + { + "name": "alias_default_1596" + }, + { + "name": "alias_default_1597" + }, + { + "name": "alias_default_1598" + }, + { + "name": "alias_default_1599" + }, + { + "name": "alias_default_1600" + }, + { + "name": "alias_default_1601" + }, + { + "name": "alias_default_1602" + }, + { + "name": "alias_default_1603" + }, + { + "name": "alias_default_1604" + }, + { + "name": "alias_default_1605" + }, + { + "name": "alias_default_1606" + }, + { + "name": "alias_default_1607" + }, + { + "name": 
"alias_default_1608" + }, + { + "name": "alias_default_1609" + }, + { + "name": "alias_default_1610" + }, + { + "name": "alias_default_1611" + }, + { + "name": "alias_default_1612" + }, + { + "name": "alias_default_1613" + }, + { + "name": "alias_default_1614" + }, + { + "name": "alias_default_1615" + }, + { + "name": "alias_default_1616" + }, + { + "name": "alias_default_1617" + }, + { + "name": "alias_default_1618" + }, + { + "name": "alias_default_1619" + }, + { + "name": "alias_default_1620" + }, + { + "name": "alias_default_1621" + }, + { + "name": "alias_default_1622" + }, + { + "name": "alias_default_1623" + }, + { + "name": "alias_default_1624" + }, + { + "name": "alias_default_1625" + }, + { + "name": "alias_default_1626" + }, + { + "name": "alias_default_1627" + }, + { + "name": "alias_default_1628" + }, + { + "name": "alias_default_1629" + }, + { + "name": "alias_default_1630" + }, + { + "name": "alias_default_1631" + }, + { + "name": "alias_default_1632" + }, + { + "name": "alias_default_1633" + }, + { + "name": "alias_default_1634" + }, + { + "name": "alias_default_1635" + }, + { + "name": "alias_default_1636" + }, + { + "name": "alias_default_1637" + }, + { + "name": "alias_default_1638" + }, + { + "name": "alias_default_1639" + }, + { + "name": "alias_default_1640" + }, + { + "name": "alias_default_1641" + }, + { + "name": "alias_default_1642" + }, + { + "name": "alias_default_1643" + }, + { + "name": "alias_default_1644" + }, + { + "name": "alias_default_1645" + }, + { + "name": "alias_default_1646" + }, + { + "name": "alias_default_1647" + }, + { + "name": "alias_default_1648" + }, + { + "name": "alias_default_1649" + }, + { + "name": "alias_default_1650" + }, + { + "name": "alias_default_1651" + }, + { + "name": "alias_default_1652" + }, + { + "name": "alias_default_1653" + }, + { + "name": "alias_default_1654" + }, + { + "name": "alias_default_1655" + }, + { + "name": "alias_default_1656" + }, + { + "name": "alias_default_1657" + }, + { + "name": 
"alias_default_1658" + }, + { + "name": "alias_default_1659" + }, + { + "name": "alias_default_1660" + }, + { + "name": "alias_default_1661" + }, + { + "name": "alias_default_1662" + }, + { + "name": "alias_default_1663" + }, + { + "name": "alias_default_1664" + }, + { + "name": "alias_default_1665" + }, + { + "name": "alias_default_1666" + }, + { + "name": "alias_default_1667" + }, + { + "name": "alias_default_1668" + }, + { + "name": "alias_default_1669" + }, + { + "name": "alias_default_1670" + }, + { + "name": "alias_default_1671" + }, + { + "name": "alias_default_1672" + }, + { + "name": "alias_default_1673" + }, + { + "name": "alias_default_1674" + }, + { + "name": "alias_default_1675" + }, + { + "name": "alias_default_1676" + }, + { + "name": "alias_default_1677" + }, + { + "name": "alias_default_1678" + }, + { + "name": "alias_default_1679" + }, + { + "name": "alias_default_1680" + }, + { + "name": "alias_default_1681" + }, + { + "name": "alias_default_1682" + }, + { + "name": "alias_default_1683" + }, + { + "name": "alias_default_1684" + }, + { + "name": "alias_default_1685" + }, + { + "name": "alias_default_1686" + }, + { + "name": "alias_default_1687" + }, + { + "name": "alias_default_1688" + }, + { + "name": "alias_default_1689" + }, + { + "name": "alias_default_1690" + }, + { + "name": "alias_default_1691" + }, + { + "name": "alias_default_1692" + }, + { + "name": "alias_default_1693" + }, + { + "name": "alias_default_1694" + }, + { + "name": "alias_default_1695" + }, + { + "name": "alias_default_1696" + }, + { + "name": "alias_default_1697" + }, + { + "name": "alias_default_1698" + }, + { + "name": "alias_default_1699" + }, + { + "name": "alias_default_1700" + }, + { + "name": "alias_default_1701" + }, + { + "name": "alias_default_1702" + }, + { + "name": "alias_default_1703" + }, + { + "name": "alias_default_1704" + }, + { + "name": "alias_default_1705" + }, + { + "name": "alias_default_1706" + }, + { + "name": "alias_default_1707" + }, + { + "name": 
"alias_default_1708" + }, + { + "name": "alias_default_1709" + }, + { + "name": "alias_default_1710" + }, + { + "name": "alias_default_1711" + } + ], + "name": "output", + "op": "output" + } + ], + "summary": { + "comm": 212780.17498325979, + "compute": 581120.8234224034, + "total": 794933.9984056632, + "transition": 1033.0 + } +} \ No newline at end of file diff --git a/profile_results/llama3_8b_4x4_strategy_summary.json b/profile_results/llama3_8b_4x4_strategy_summary.json new file mode 100644 index 00000000..ccdeb4d9 --- /dev/null +++ b/profile_results/llama3_8b_4x4_strategy_summary.json @@ -0,0 +1,2054 @@ +{ + "config": { + "batch_size": 8, + "input_constraint": "Shard(0), Replicate()", + "mesh_dim_names": [ + "dp", + "tp" + ], + "mesh_shape": [ + 4, + 4 + ], + "model": "autoparallel._testing.models.llama3 Transformer 8B config", + "output_constraint": "Shard(0), Shard(2)", + "seqlen": 8192, + "vocab_size": 128256, + "world_size": 16 + }, + "elapsed_s": 115.23945621983148, + "json_summary": { + "comm": 212780.17498325979, + "compute": 581120.8234224034, + "total": 794933.9984056632, + "transition": 1033.0 + }, + "optimizer_profile": { + "ilp": { + "cluster_copied_decision_variables": 8181840, + "constraints": 175408, + "logical_decision_variables": 8657526, + "unique_variables": 475686 + }, + "last_solve": { + "constraints": 175412, + "extract_s": 0.044429945992305875, + "kind": "solve", + "objective": 794933.998405679, + "objective_s": 3.8023465629667044, + "pipeline_total_s": 102.16174313612282, + "solve_s": 59.80278266593814, + "status": "Optimal", + "total_s": 63.73084603413008, + "unique_variables": 475686 + }, + "mesh": { + "dim_names": [ + "dp", + "tp" + ], + "ndim": 2, + "shape": [ + 4, + 4 + ], + "size": 16 + }, + "model": { + "graph_nodes": 8668, + "op_counts": { + "call_function": 8373, + "output": 1, + "placeholder": 294 + }, + "parameter_bytes": 32121044992, + "parameter_nodes": 291, + "parameter_numel": 8030261248, + "tensor_nodes": 8667, + 
"unknown_parameter_nodes": 0 + }, + "strategies": { + "max_strategies_per_node": 81, + "nodes": 8668, + "option_tuples": 8657526, + "strategy_options": 220687 + }, + "timings": { + "compute_cost_estimation_s": 1.9735342266503721, + "constraint_construction_s": 3.2506618059705943, + "cost_estimation_s": 4.9254587206523865, + "decision_var_build_s": 15.363263476872817, + "decision_var_overhead_s": 6.9146421970799565, + "edge_cost_estimation_s": 2.9519244940020144, + "ilp_construction_s": 13.688466562191024, + "init_total_s": 38.43089710199274, + "pulp_var_creation_s": 3.5231625591404736, + "strategy_enumeration_s": 10.847158421995118, + "validation_s": 0.060926787089556456 + } + }, + "param_strategy_groups": { + "layers.*.attention.wk.weight": { + "S(0)S(0)": 32 + }, + "layers.*.attention.wo.weight": { + "S(0)S(0)": 32 + }, + "layers.*.attention.wq.weight": { + "S(0)S(0)": 32 + }, + "layers.*.attention.wv.weight": { + "S(0)S(0)": 32 + }, + "layers.*.attention_norm.weight": { + "S(0)S(0)": 32 + }, + "layers.*.feed_forward.w1.weight": { + "S(0)S(0)": 32 + }, + "layers.*.feed_forward.w2.weight": { + "S(0)S(1)": 32 + }, + "layers.*.feed_forward.w3.weight": { + "S(0)S(0)": 32 + }, + "layers.*.ffn_norm.weight": { + "S(0)S(0)": 32 + }, + "norm.weight": { + "S(0)S(0)": 1 + }, + "output.weight": { + "S(0)S(0)": 1 + }, + "tok_embeddings.weight": { + "S(1)S(1)": 1 + } + }, + "phase_placement_counts": { + "backward": [ + [ + "S(0)S(2)", + 1634 + ], + [ + "S(0)S(1)", + 1423 + ], + [ + "P(sum)S(0)", + 354 + ], + [ + "P(sum)P(sum)", + 291 + ], + [ + "S(0)S(0)", + 258 + ], + [ + "RR", + 257 + ], + [ + "P(sum)S(1)", + 225 + ], + [ + "RS(0)", + 129 + ], + [ + "S(0)P(sum)", + 97 + ], + [ + "S(0)R", + 32 + ], + [ + "RS(1)", + 32 + ], + [ + "(S(0)S(1), S(0)S(1), S(0)S(1))", + 32 + ], + [ + "S(1)S(1)", + 3 + ] + ], + "forward": [ + [ + "S(0)S(2)", + 1378 + ], + [ + "S(0)S(1)", + 1227 + ], + [ + "S(0)S(0)", + 516 + ], + [ + "RR", + 324 + ], + [ + "RS(1)", + 258 + ], + [ + "S(0)R", + 66 + 
], + [ + "RS(0)", + 64 + ], + [ + "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + 32 + ], + [ + "S(0)P(sum)", + 32 + ], + [ + "S(1)S(1)", + 2 + ], + [ + "S(2)S(2)", + 1 + ] + ] + }, + "placement_counts": [ + [ + "S(0)S(2)", + 3012 + ], + [ + "S(0)S(1)", + 2650 + ], + [ + "S(0)S(0)", + 774 + ], + [ + "RR", + 581 + ], + [ + "P(sum)S(0)", + 354 + ], + [ + "P(sum)P(sum)", + 291 + ], + [ + "RS(1)", + 290 + ], + [ + "P(sum)S(1)", + 225 + ], + [ + "RS(0)", + 193 + ], + [ + "S(0)P(sum)", + 129 + ], + [ + "S(0)R", + 98 + ], + [ + "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", + 32 + ], + [ + "(S(0)S(1), S(0)S(1), S(0)S(1))", + 32 + ], + [ + "S(1)S(1)", + 5 + ], + [ + "S(2)S(2)", + 1 + ] + ], + "sample_forward_interesting_nodes": [ + { + "inputs": [], + "module_path": "layers.0.attention.wq.weight", + "name": "primals_2", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.0.attention.wk.weight", + "name": "primals_3", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.0.attention.wv.weight", + "name": "primals_4", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.0.attention.wo.weight", + "name": "primals_5", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.0.feed_forward.w1.weight", + "name": "primals_6", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.0.feed_forward.w2.weight", + "name": "primals_7", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.0.feed_forward.w3.weight", + "name": "primals_8", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ 
+ 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.1.attention.wq.weight", + "name": "primals_11", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.1.attention.wk.weight", + "name": "primals_12", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.1.attention.wv.weight", + "name": "primals_13", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.1.attention.wo.weight", + "name": "primals_14", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.1.feed_forward.w1.weight", + "name": "primals_15", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.1.feed_forward.w2.weight", + "name": "primals_16", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.1.feed_forward.w3.weight", + "name": "primals_17", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.2.attention.wq.weight", + "name": "primals_20", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.2.attention.wk.weight", + "name": "primals_21", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.2.attention.wv.weight", + "name": "primals_22", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.2.attention.wo.weight", + "name": "primals_23", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 
4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.2.feed_forward.w1.weight", + "name": "primals_24", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.2.feed_forward.w2.weight", + "name": "primals_25", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.2.feed_forward.w3.weight", + "name": "primals_26", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.3.attention.wq.weight", + "name": "primals_29", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.3.attention.wk.weight", + "name": "primals_30", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.3.attention.wv.weight", + "name": "primals_31", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.3.attention.wo.weight", + "name": "primals_32", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.3.feed_forward.w1.weight", + "name": "primals_33", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.3.feed_forward.w2.weight", + "name": "primals_34", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.3.feed_forward.w3.weight", + "name": "primals_35", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.4.attention.wq.weight", + "name": "primals_38", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": 
[ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.4.attention.wk.weight", + "name": "primals_39", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.4.attention.wv.weight", + "name": "primals_40", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.4.attention.wo.weight", + "name": "primals_41", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.4.feed_forward.w1.weight", + "name": "primals_42", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.4.feed_forward.w2.weight", + "name": "primals_43", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.4.feed_forward.w3.weight", + "name": "primals_44", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.5.attention.wq.weight", + "name": "primals_47", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.5.attention.wk.weight", + "name": "primals_48", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.5.attention.wv.weight", + "name": "primals_49", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.5.attention.wo.weight", + "name": "primals_50", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.5.feed_forward.w1.weight", + "name": "primals_51", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 
14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.5.feed_forward.w2.weight", + "name": "primals_52", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.5.feed_forward.w3.weight", + "name": "primals_53", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.6.attention.wq.weight", + "name": "primals_56", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.6.attention.wk.weight", + "name": "primals_57", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.6.attention.wv.weight", + "name": "primals_58", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.6.attention.wo.weight", + "name": "primals_59", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.6.feed_forward.w1.weight", + "name": "primals_60", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.6.feed_forward.w2.weight", + "name": "primals_61", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.6.feed_forward.w3.weight", + "name": "primals_62", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.7.attention.wq.weight", + "name": "primals_65", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.7.attention.wk.weight", + "name": "primals_66", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ 
+ 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.7.attention.wv.weight", + "name": "primals_67", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.7.attention.wo.weight", + "name": "primals_68", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.7.feed_forward.w1.weight", + "name": "primals_69", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.7.feed_forward.w2.weight", + "name": "primals_70", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.7.feed_forward.w3.weight", + "name": "primals_71", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.8.attention.wq.weight", + "name": "primals_74", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.8.attention.wk.weight", + "name": "primals_75", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.8.attention.wv.weight", + "name": "primals_76", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.8.attention.wo.weight", + "name": "primals_77", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.8.feed_forward.w1.weight", + "name": "primals_78", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.8.feed_forward.w2.weight", + "name": "primals_79", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ 
+ 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.8.feed_forward.w3.weight", + "name": "primals_80", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.9.attention.wq.weight", + "name": "primals_83", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.9.attention.wk.weight", + "name": "primals_84", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.9.attention.wv.weight", + "name": "primals_85", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.9.attention.wo.weight", + "name": "primals_86", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.9.feed_forward.w1.weight", + "name": "primals_87", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.9.feed_forward.w2.weight", + "name": "primals_88", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.9.feed_forward.w3.weight", + "name": "primals_89", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.10.attention.wq.weight", + "name": "primals_92", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.10.attention.wk.weight", + "name": "primals_93", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.10.attention.wv.weight", + "name": "primals_94", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ 
+ 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.10.attention.wo.weight", + "name": "primals_95", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.10.feed_forward.w1.weight", + "name": "primals_96", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.10.feed_forward.w2.weight", + "name": "primals_97", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.10.feed_forward.w3.weight", + "name": "primals_98", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.11.attention.wq.weight", + "name": "primals_101", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.11.attention.wk.weight", + "name": "primals_102", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.11.attention.wv.weight", + "name": "primals_103", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.11.attention.wo.weight", + "name": "primals_104", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.11.feed_forward.w1.weight", + "name": "primals_105", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.11.feed_forward.w2.weight", + "name": "primals_106", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.11.feed_forward.w3.weight", + "name": "primals_107", + "op": "placeholder", + "placement": 
"S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.12.attention.wq.weight", + "name": "primals_110", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.12.attention.wk.weight", + "name": "primals_111", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.12.attention.wv.weight", + "name": "primals_112", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.12.attention.wo.weight", + "name": "primals_113", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.12.feed_forward.w1.weight", + "name": "primals_114", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.12.feed_forward.w2.weight", + "name": "primals_115", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.12.feed_forward.w3.weight", + "name": "primals_116", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.13.attention.wq.weight", + "name": "primals_119", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.13.attention.wk.weight", + "name": "primals_120", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.13.attention.wv.weight", + "name": "primals_121", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.13.attention.wo.weight", + "name": "primals_122", + "op": "placeholder", 
+ "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.13.feed_forward.w1.weight", + "name": "primals_123", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.13.feed_forward.w2.weight", + "name": "primals_124", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.13.feed_forward.w3.weight", + "name": "primals_125", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.14.attention.wq.weight", + "name": "primals_128", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.14.attention.wk.weight", + "name": "primals_129", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.14.attention.wv.weight", + "name": "primals_130", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.14.attention.wo.weight", + "name": "primals_131", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.14.feed_forward.w1.weight", + "name": "primals_132", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.14.feed_forward.w2.weight", + "name": "primals_133", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.14.feed_forward.w3.weight", + "name": "primals_134", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.15.attention.wq.weight", + "name": 
"primals_137", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.15.attention.wk.weight", + "name": "primals_138", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.15.attention.wv.weight", + "name": "primals_139", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.15.attention.wo.weight", + "name": "primals_140", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.15.feed_forward.w1.weight", + "name": "primals_141", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.15.feed_forward.w2.weight", + "name": "primals_142", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.15.feed_forward.w3.weight", + "name": "primals_143", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.16.attention.wq.weight", + "name": "primals_146", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.16.attention.wk.weight", + "name": "primals_147", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.16.attention.wv.weight", + "name": "primals_148", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.16.attention.wo.weight", + "name": "primals_149", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": 
"layers.16.feed_forward.w1.weight", + "name": "primals_150", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.16.feed_forward.w2.weight", + "name": "primals_151", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.16.feed_forward.w3.weight", + "name": "primals_152", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.17.attention.wq.weight", + "name": "primals_155", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.17.attention.wk.weight", + "name": "primals_156", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.17.attention.wv.weight", + "name": "primals_157", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.17.attention.wo.weight", + "name": "primals_158", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.17.feed_forward.w1.weight", + "name": "primals_159", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.17.feed_forward.w2.weight", + "name": "primals_160", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.17.feed_forward.w3.weight", + "name": "primals_161", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.18.attention.wq.weight", + "name": "primals_164", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + 
"inputs": [], + "module_path": "layers.18.attention.wk.weight", + "name": "primals_165", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.18.attention.wv.weight", + "name": "primals_166", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.18.attention.wo.weight", + "name": "primals_167", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.18.feed_forward.w1.weight", + "name": "primals_168", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.18.feed_forward.w2.weight", + "name": "primals_169", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.18.feed_forward.w3.weight", + "name": "primals_170", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.19.attention.wq.weight", + "name": "primals_173", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.19.attention.wk.weight", + "name": "primals_174", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.19.attention.wv.weight", + "name": "primals_175", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.19.attention.wo.weight", + "name": "primals_176", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.19.feed_forward.w1.weight", + "name": "primals_177", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 
4096 + ] + }, + { + "inputs": [], + "module_path": "layers.19.feed_forward.w2.weight", + "name": "primals_178", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.19.feed_forward.w3.weight", + "name": "primals_179", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.20.attention.wq.weight", + "name": "primals_182", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.20.attention.wk.weight", + "name": "primals_183", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.20.attention.wv.weight", + "name": "primals_184", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.20.attention.wo.weight", + "name": "primals_185", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.20.feed_forward.w1.weight", + "name": "primals_186", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.20.feed_forward.w2.weight", + "name": "primals_187", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.20.feed_forward.w3.weight", + "name": "primals_188", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.21.attention.wq.weight", + "name": "primals_191", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.21.attention.wk.weight", + "name": "primals_192", + "op": "placeholder", + "placement": "S(0)S(0)", 
+ "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.21.attention.wv.weight", + "name": "primals_193", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.21.attention.wo.weight", + "name": "primals_194", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.21.feed_forward.w1.weight", + "name": "primals_195", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.21.feed_forward.w2.weight", + "name": "primals_196", + "op": "placeholder", + "placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + }, + { + "inputs": [], + "module_path": "layers.21.feed_forward.w3.weight", + "name": "primals_197", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.22.attention.wq.weight", + "name": "primals_200", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.22.attention.wk.weight", + "name": "primals_201", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.22.attention.wv.weight", + "name": "primals_202", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 1024, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.22.attention.wo.weight", + "name": "primals_203", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 4096, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.22.feed_forward.w1.weight", + "name": "primals_204", + "op": "placeholder", + "placement": "S(0)S(0)", + "shape": [ + 14336, + 4096 + ] + }, + { + "inputs": [], + "module_path": "layers.22.feed_forward.w2.weight", + "name": "primals_205", + "op": "placeholder", + 
"placement": "S(0)S(1)", + "shape": [ + 4096, + 14336 + ] + } + ] +} \ No newline at end of file diff --git a/profile_results/real_llama3_3b_dag_node_stats.csv b/profile_results/real_llama3_3b_dag_node_stats.csv new file mode 100644 index 00000000..5f813f1b --- /dev/null +++ b/profile_results/real_llama3_3b_dag_node_stats.csv @@ -0,0 +1,7200 @@ +idx,name,op,target,phase,layer,direct_dependency_args,direct_dependency_nodes,direct_offspring_nodes,ancestor_count,descendant_count,strategy_count +0,primals_1,placeholder,primals_1,unknown,,0,0,1,0,5816,3 +1,primals_2,placeholder,primals_2,unknown,,0,0,1,0,5777,3 +2,primals_3,placeholder,primals_3,unknown,,0,0,1,0,5777,3 +3,primals_4,placeholder,primals_4,unknown,,0,0,1,0,5770,3 +4,primals_5,placeholder,primals_5,unknown,,0,0,1,0,5757,3 +5,primals_6,placeholder,primals_6,unknown,,0,0,1,0,5737,3 +6,primals_7,placeholder,primals_7,unknown,,0,0,1,0,5714,3 +7,primals_8,placeholder,primals_8,unknown,,0,0,1,0,5718,3 +8,primals_9,placeholder,primals_9,unknown,,0,0,1,0,5794,2 +9,primals_10,placeholder,primals_10,unknown,,0,0,1,0,5741,2 +10,primals_11,placeholder,primals_11,unknown,,0,0,1,0,5681,3 +11,primals_12,placeholder,primals_12,unknown,,0,0,1,0,5681,3 +12,primals_13,placeholder,primals_13,unknown,,0,0,1,0,5674,3 +13,primals_14,placeholder,primals_14,unknown,,0,0,1,0,5661,3 +14,primals_15,placeholder,primals_15,unknown,,0,0,1,0,5641,3 +15,primals_16,placeholder,primals_16,unknown,,0,0,1,0,5618,3 +16,primals_17,placeholder,primals_17,unknown,,0,0,1,0,5622,3 +17,primals_18,placeholder,primals_18,unknown,,0,0,1,0,5698,2 +18,primals_19,placeholder,primals_19,unknown,,0,0,1,0,5645,2 +19,primals_20,placeholder,primals_20,unknown,,0,0,1,0,5585,3 +20,primals_21,placeholder,primals_21,unknown,,0,0,1,0,5585,3 +21,primals_22,placeholder,primals_22,unknown,,0,0,1,0,5578,3 +22,primals_23,placeholder,primals_23,unknown,,0,0,1,0,5565,3 +23,primals_24,placeholder,primals_24,unknown,,0,0,1,0,5545,3 
+24,primals_25,placeholder,primals_25,unknown,,0,0,1,0,5522,3 +25,primals_26,placeholder,primals_26,unknown,,0,0,1,0,5526,3 +26,primals_27,placeholder,primals_27,unknown,,0,0,1,0,5602,2 +27,primals_28,placeholder,primals_28,unknown,,0,0,1,0,5549,2 +28,primals_29,placeholder,primals_29,unknown,,0,0,1,0,5489,3 +29,primals_30,placeholder,primals_30,unknown,,0,0,1,0,5489,3 +30,primals_31,placeholder,primals_31,unknown,,0,0,1,0,5482,3 +31,primals_32,placeholder,primals_32,unknown,,0,0,1,0,5469,3 +32,primals_33,placeholder,primals_33,unknown,,0,0,1,0,5449,3 +33,primals_34,placeholder,primals_34,unknown,,0,0,1,0,5426,3 +34,primals_35,placeholder,primals_35,unknown,,0,0,1,0,5430,3 +35,primals_36,placeholder,primals_36,unknown,,0,0,1,0,5506,2 +36,primals_37,placeholder,primals_37,unknown,,0,0,1,0,5453,2 +37,primals_38,placeholder,primals_38,unknown,,0,0,1,0,5393,3 +38,primals_39,placeholder,primals_39,unknown,,0,0,1,0,5393,3 +39,primals_40,placeholder,primals_40,unknown,,0,0,1,0,5386,3 +40,primals_41,placeholder,primals_41,unknown,,0,0,1,0,5373,3 +41,primals_42,placeholder,primals_42,unknown,,0,0,1,0,5353,3 +42,primals_43,placeholder,primals_43,unknown,,0,0,1,0,5330,3 +43,primals_44,placeholder,primals_44,unknown,,0,0,1,0,5334,3 +44,primals_45,placeholder,primals_45,unknown,,0,0,1,0,5410,2 +45,primals_46,placeholder,primals_46,unknown,,0,0,1,0,5357,2 +46,primals_47,placeholder,primals_47,unknown,,0,0,1,0,5297,3 +47,primals_48,placeholder,primals_48,unknown,,0,0,1,0,5297,3 +48,primals_49,placeholder,primals_49,unknown,,0,0,1,0,5290,3 +49,primals_50,placeholder,primals_50,unknown,,0,0,1,0,5277,3 +50,primals_51,placeholder,primals_51,unknown,,0,0,1,0,5257,3 +51,primals_52,placeholder,primals_52,unknown,,0,0,1,0,5234,3 +52,primals_53,placeholder,primals_53,unknown,,0,0,1,0,5238,3 +53,primals_54,placeholder,primals_54,unknown,,0,0,1,0,5314,2 +54,primals_55,placeholder,primals_55,unknown,,0,0,1,0,5261,2 +55,primals_56,placeholder,primals_56,unknown,,0,0,1,0,5201,3 
+56,primals_57,placeholder,primals_57,unknown,,0,0,1,0,5201,3 +57,primals_58,placeholder,primals_58,unknown,,0,0,1,0,5194,3 +58,primals_59,placeholder,primals_59,unknown,,0,0,1,0,5181,3 +59,primals_60,placeholder,primals_60,unknown,,0,0,1,0,5161,3 +60,primals_61,placeholder,primals_61,unknown,,0,0,1,0,5138,3 +61,primals_62,placeholder,primals_62,unknown,,0,0,1,0,5142,3 +62,primals_63,placeholder,primals_63,unknown,,0,0,1,0,5218,2 +63,primals_64,placeholder,primals_64,unknown,,0,0,1,0,5165,2 +64,primals_65,placeholder,primals_65,unknown,,0,0,1,0,5105,3 +65,primals_66,placeholder,primals_66,unknown,,0,0,1,0,5105,3 +66,primals_67,placeholder,primals_67,unknown,,0,0,1,0,5098,3 +67,primals_68,placeholder,primals_68,unknown,,0,0,1,0,5085,3 +68,primals_69,placeholder,primals_69,unknown,,0,0,1,0,5065,3 +69,primals_70,placeholder,primals_70,unknown,,0,0,1,0,5042,3 +70,primals_71,placeholder,primals_71,unknown,,0,0,1,0,5046,3 +71,primals_72,placeholder,primals_72,unknown,,0,0,1,0,5122,2 +72,primals_73,placeholder,primals_73,unknown,,0,0,1,0,5069,2 +73,primals_74,placeholder,primals_74,unknown,,0,0,1,0,5009,3 +74,primals_75,placeholder,primals_75,unknown,,0,0,1,0,5009,3 +75,primals_76,placeholder,primals_76,unknown,,0,0,1,0,5002,3 +76,primals_77,placeholder,primals_77,unknown,,0,0,1,0,4989,3 +77,primals_78,placeholder,primals_78,unknown,,0,0,1,0,4969,3 +78,primals_79,placeholder,primals_79,unknown,,0,0,1,0,4946,3 +79,primals_80,placeholder,primals_80,unknown,,0,0,1,0,4950,3 +80,primals_81,placeholder,primals_81,unknown,,0,0,1,0,5026,2 +81,primals_82,placeholder,primals_82,unknown,,0,0,1,0,4973,2 +82,primals_83,placeholder,primals_83,unknown,,0,0,1,0,4913,3 +83,primals_84,placeholder,primals_84,unknown,,0,0,1,0,4913,3 +84,primals_85,placeholder,primals_85,unknown,,0,0,1,0,4906,3 +85,primals_86,placeholder,primals_86,unknown,,0,0,1,0,4893,3 +86,primals_87,placeholder,primals_87,unknown,,0,0,1,0,4873,3 +87,primals_88,placeholder,primals_88,unknown,,0,0,1,0,4850,3 
+88,primals_89,placeholder,primals_89,unknown,,0,0,1,0,4854,3 +89,primals_90,placeholder,primals_90,unknown,,0,0,1,0,4930,2 +90,primals_91,placeholder,primals_91,unknown,,0,0,1,0,4877,2 +91,primals_92,placeholder,primals_92,unknown,,0,0,1,0,4817,3 +92,primals_93,placeholder,primals_93,unknown,,0,0,1,0,4817,3 +93,primals_94,placeholder,primals_94,unknown,,0,0,1,0,4810,3 +94,primals_95,placeholder,primals_95,unknown,,0,0,1,0,4797,3 +95,primals_96,placeholder,primals_96,unknown,,0,0,1,0,4777,3 +96,primals_97,placeholder,primals_97,unknown,,0,0,1,0,4754,3 +97,primals_98,placeholder,primals_98,unknown,,0,0,1,0,4758,3 +98,primals_99,placeholder,primals_99,unknown,,0,0,1,0,4834,2 +99,primals_100,placeholder,primals_100,unknown,,0,0,1,0,4781,2 +100,primals_101,placeholder,primals_101,unknown,,0,0,1,0,4721,3 +101,primals_102,placeholder,primals_102,unknown,,0,0,1,0,4721,3 +102,primals_103,placeholder,primals_103,unknown,,0,0,1,0,4714,3 +103,primals_104,placeholder,primals_104,unknown,,0,0,1,0,4701,3 +104,primals_105,placeholder,primals_105,unknown,,0,0,1,0,4681,3 +105,primals_106,placeholder,primals_106,unknown,,0,0,1,0,4658,3 +106,primals_107,placeholder,primals_107,unknown,,0,0,1,0,4662,3 +107,primals_108,placeholder,primals_108,unknown,,0,0,1,0,4738,2 +108,primals_109,placeholder,primals_109,unknown,,0,0,1,0,4685,2 +109,primals_110,placeholder,primals_110,unknown,,0,0,1,0,4625,3 +110,primals_111,placeholder,primals_111,unknown,,0,0,1,0,4625,3 +111,primals_112,placeholder,primals_112,unknown,,0,0,1,0,4618,3 +112,primals_113,placeholder,primals_113,unknown,,0,0,1,0,4605,3 +113,primals_114,placeholder,primals_114,unknown,,0,0,1,0,4585,3 +114,primals_115,placeholder,primals_115,unknown,,0,0,1,0,4562,3 +115,primals_116,placeholder,primals_116,unknown,,0,0,1,0,4566,3 +116,primals_117,placeholder,primals_117,unknown,,0,0,1,0,4642,2 +117,primals_118,placeholder,primals_118,unknown,,0,0,1,0,4589,2 +118,primals_119,placeholder,primals_119,unknown,,0,0,1,0,4529,3 
+119,primals_120,placeholder,primals_120,unknown,,0,0,1,0,4529,3 +120,primals_121,placeholder,primals_121,unknown,,0,0,1,0,4522,3 +121,primals_122,placeholder,primals_122,unknown,,0,0,1,0,4509,3 +122,primals_123,placeholder,primals_123,unknown,,0,0,1,0,4489,3 +123,primals_124,placeholder,primals_124,unknown,,0,0,1,0,4466,3 +124,primals_125,placeholder,primals_125,unknown,,0,0,1,0,4470,3 +125,primals_126,placeholder,primals_126,unknown,,0,0,1,0,4546,2 +126,primals_127,placeholder,primals_127,unknown,,0,0,1,0,4493,2 +127,primals_128,placeholder,primals_128,unknown,,0,0,1,0,4433,3 +128,primals_129,placeholder,primals_129,unknown,,0,0,1,0,4433,3 +129,primals_130,placeholder,primals_130,unknown,,0,0,1,0,4426,3 +130,primals_131,placeholder,primals_131,unknown,,0,0,1,0,4413,3 +131,primals_132,placeholder,primals_132,unknown,,0,0,1,0,4393,3 +132,primals_133,placeholder,primals_133,unknown,,0,0,1,0,4370,3 +133,primals_134,placeholder,primals_134,unknown,,0,0,1,0,4374,3 +134,primals_135,placeholder,primals_135,unknown,,0,0,1,0,4450,2 +135,primals_136,placeholder,primals_136,unknown,,0,0,1,0,4397,2 +136,primals_137,placeholder,primals_137,unknown,,0,0,1,0,4337,3 +137,primals_138,placeholder,primals_138,unknown,,0,0,1,0,4337,3 +138,primals_139,placeholder,primals_139,unknown,,0,0,1,0,4330,3 +139,primals_140,placeholder,primals_140,unknown,,0,0,1,0,4317,3 +140,primals_141,placeholder,primals_141,unknown,,0,0,1,0,4297,3 +141,primals_142,placeholder,primals_142,unknown,,0,0,1,0,4274,3 +142,primals_143,placeholder,primals_143,unknown,,0,0,1,0,4278,3 +143,primals_144,placeholder,primals_144,unknown,,0,0,1,0,4354,2 +144,primals_145,placeholder,primals_145,unknown,,0,0,1,0,4301,2 +145,primals_146,placeholder,primals_146,unknown,,0,0,1,0,4241,3 +146,primals_147,placeholder,primals_147,unknown,,0,0,1,0,4241,3 +147,primals_148,placeholder,primals_148,unknown,,0,0,1,0,4234,3 +148,primals_149,placeholder,primals_149,unknown,,0,0,1,0,4221,3 
+149,primals_150,placeholder,primals_150,unknown,,0,0,1,0,4201,3 +150,primals_151,placeholder,primals_151,unknown,,0,0,1,0,4178,3 +151,primals_152,placeholder,primals_152,unknown,,0,0,1,0,4182,3 +152,primals_153,placeholder,primals_153,unknown,,0,0,1,0,4258,2 +153,primals_154,placeholder,primals_154,unknown,,0,0,1,0,4205,2 +154,primals_155,placeholder,primals_155,unknown,,0,0,1,0,4145,3 +155,primals_156,placeholder,primals_156,unknown,,0,0,1,0,4145,3 +156,primals_157,placeholder,primals_157,unknown,,0,0,1,0,4138,3 +157,primals_158,placeholder,primals_158,unknown,,0,0,1,0,4125,3 +158,primals_159,placeholder,primals_159,unknown,,0,0,1,0,4105,3 +159,primals_160,placeholder,primals_160,unknown,,0,0,1,0,4082,3 +160,primals_161,placeholder,primals_161,unknown,,0,0,1,0,4086,3 +161,primals_162,placeholder,primals_162,unknown,,0,0,1,0,4162,2 +162,primals_163,placeholder,primals_163,unknown,,0,0,1,0,4109,2 +163,primals_164,placeholder,primals_164,unknown,,0,0,1,0,4049,3 +164,primals_165,placeholder,primals_165,unknown,,0,0,1,0,4049,3 +165,primals_166,placeholder,primals_166,unknown,,0,0,1,0,4042,3 +166,primals_167,placeholder,primals_167,unknown,,0,0,1,0,4029,3 +167,primals_168,placeholder,primals_168,unknown,,0,0,1,0,4009,3 +168,primals_169,placeholder,primals_169,unknown,,0,0,1,0,3986,3 +169,primals_170,placeholder,primals_170,unknown,,0,0,1,0,3990,3 +170,primals_171,placeholder,primals_171,unknown,,0,0,1,0,4066,2 +171,primals_172,placeholder,primals_172,unknown,,0,0,1,0,4013,2 +172,primals_173,placeholder,primals_173,unknown,,0,0,1,0,3953,3 +173,primals_174,placeholder,primals_174,unknown,,0,0,1,0,3953,3 +174,primals_175,placeholder,primals_175,unknown,,0,0,1,0,3946,3 +175,primals_176,placeholder,primals_176,unknown,,0,0,1,0,3933,3 +176,primals_177,placeholder,primals_177,unknown,,0,0,1,0,3913,3 +177,primals_178,placeholder,primals_178,unknown,,0,0,1,0,3890,3 +178,primals_179,placeholder,primals_179,unknown,,0,0,1,0,3894,3 
+179,primals_180,placeholder,primals_180,unknown,,0,0,1,0,3970,2 +180,primals_181,placeholder,primals_181,unknown,,0,0,1,0,3917,2 +181,primals_182,placeholder,primals_182,unknown,,0,0,1,0,3857,3 +182,primals_183,placeholder,primals_183,unknown,,0,0,1,0,3857,3 +183,primals_184,placeholder,primals_184,unknown,,0,0,1,0,3850,3 +184,primals_185,placeholder,primals_185,unknown,,0,0,1,0,3837,3 +185,primals_186,placeholder,primals_186,unknown,,0,0,1,0,3817,3 +186,primals_187,placeholder,primals_187,unknown,,0,0,1,0,3794,3 +187,primals_188,placeholder,primals_188,unknown,,0,0,1,0,3798,3 +188,primals_189,placeholder,primals_189,unknown,,0,0,1,0,3874,2 +189,primals_190,placeholder,primals_190,unknown,,0,0,1,0,3821,2 +190,primals_191,placeholder,primals_191,unknown,,0,0,1,0,3761,3 +191,primals_192,placeholder,primals_192,unknown,,0,0,1,0,3761,3 +192,primals_193,placeholder,primals_193,unknown,,0,0,1,0,3754,3 +193,primals_194,placeholder,primals_194,unknown,,0,0,1,0,3741,3 +194,primals_195,placeholder,primals_195,unknown,,0,0,1,0,3721,3 +195,primals_196,placeholder,primals_196,unknown,,0,0,1,0,3698,3 +196,primals_197,placeholder,primals_197,unknown,,0,0,1,0,3702,3 +197,primals_198,placeholder,primals_198,unknown,,0,0,1,0,3778,2 +198,primals_199,placeholder,primals_199,unknown,,0,0,1,0,3725,2 +199,primals_200,placeholder,primals_200,unknown,,0,0,1,0,3665,3 +200,primals_201,placeholder,primals_201,unknown,,0,0,1,0,3665,3 +201,primals_202,placeholder,primals_202,unknown,,0,0,1,0,3658,3 +202,primals_203,placeholder,primals_203,unknown,,0,0,1,0,3645,3 +203,primals_204,placeholder,primals_204,unknown,,0,0,1,0,3625,3 +204,primals_205,placeholder,primals_205,unknown,,0,0,1,0,3602,3 +205,primals_206,placeholder,primals_206,unknown,,0,0,1,0,3606,3 +206,primals_207,placeholder,primals_207,unknown,,0,0,1,0,3682,2 +207,primals_208,placeholder,primals_208,unknown,,0,0,1,0,3629,2 +208,primals_209,placeholder,primals_209,unknown,,0,0,1,0,3569,3 
+209,primals_210,placeholder,primals_210,unknown,,0,0,1,0,3569,3 +210,primals_211,placeholder,primals_211,unknown,,0,0,1,0,3562,3 +211,primals_212,placeholder,primals_212,unknown,,0,0,1,0,3549,3 +212,primals_213,placeholder,primals_213,unknown,,0,0,1,0,3529,3 +213,primals_214,placeholder,primals_214,unknown,,0,0,1,0,3506,3 +214,primals_215,placeholder,primals_215,unknown,,0,0,1,0,3510,3 +215,primals_216,placeholder,primals_216,unknown,,0,0,1,0,3586,2 +216,primals_217,placeholder,primals_217,unknown,,0,0,1,0,3533,2 +217,primals_218,placeholder,primals_218,unknown,,0,0,1,0,3473,3 +218,primals_219,placeholder,primals_219,unknown,,0,0,1,0,3473,3 +219,primals_220,placeholder,primals_220,unknown,,0,0,1,0,3466,3 +220,primals_221,placeholder,primals_221,unknown,,0,0,1,0,3453,3 +221,primals_222,placeholder,primals_222,unknown,,0,0,1,0,3433,3 +222,primals_223,placeholder,primals_223,unknown,,0,0,1,0,3410,3 +223,primals_224,placeholder,primals_224,unknown,,0,0,1,0,3414,3 +224,primals_225,placeholder,primals_225,unknown,,0,0,1,0,3490,2 +225,primals_226,placeholder,primals_226,unknown,,0,0,1,0,3437,2 +226,primals_227,placeholder,primals_227,unknown,,0,0,1,0,3377,3 +227,primals_228,placeholder,primals_228,unknown,,0,0,1,0,3377,3 +228,primals_229,placeholder,primals_229,unknown,,0,0,1,0,3370,3 +229,primals_230,placeholder,primals_230,unknown,,0,0,1,0,3357,3 +230,primals_231,placeholder,primals_231,unknown,,0,0,1,0,3337,3 +231,primals_232,placeholder,primals_232,unknown,,0,0,1,0,3314,3 +232,primals_233,placeholder,primals_233,unknown,,0,0,1,0,3318,3 +233,primals_234,placeholder,primals_234,unknown,,0,0,1,0,3394,2 +234,primals_235,placeholder,primals_235,unknown,,0,0,1,0,3341,2 +235,primals_236,placeholder,primals_236,unknown,,0,0,1,0,3281,3 +236,primals_237,placeholder,primals_237,unknown,,0,0,1,0,3281,3 +237,primals_238,placeholder,primals_238,unknown,,0,0,1,0,3274,3 +238,primals_239,placeholder,primals_239,unknown,,0,0,1,0,3261,3 
+239,primals_240,placeholder,primals_240,unknown,,0,0,1,0,3241,3 +240,primals_241,placeholder,primals_241,unknown,,0,0,1,0,3218,3 +241,primals_242,placeholder,primals_242,unknown,,0,0,1,0,3222,3 +242,primals_243,placeholder,primals_243,unknown,,0,0,1,0,3298,2 +243,primals_244,placeholder,primals_244,unknown,,0,0,1,0,3245,2 +244,primals_245,placeholder,primals_245,unknown,,0,0,1,0,3185,3 +245,primals_246,placeholder,primals_246,unknown,,0,0,1,0,3185,3 +246,primals_247,placeholder,primals_247,unknown,,0,0,1,0,3178,3 +247,primals_248,placeholder,primals_248,unknown,,0,0,1,0,3165,3 +248,primals_249,placeholder,primals_249,unknown,,0,0,1,0,3145,3 +249,primals_250,placeholder,primals_250,unknown,,0,0,1,0,3122,3 +250,primals_251,placeholder,primals_251,unknown,,0,0,1,0,3126,3 +251,primals_252,placeholder,primals_252,unknown,,0,0,1,0,3202,2 +252,primals_253,placeholder,primals_253,unknown,,0,0,1,0,3149,2 +253,primals_254,placeholder,primals_254,unknown,,0,0,1,0,3103,2 +254,primals_255,placeholder,primals_255,unknown,,0,0,1,0,5943,3 +255,primals_256,placeholder,primals_256,unknown,,0,0,1,0,5806,3 +256,tangents_1,placeholder,tangents_1,backward,,0,0,1,0,3104,4 +257,alias_default,call_function,alias.default,unknown,,1,1,2,1,5815,3 +258,dtype_cast,call_function,dtype_cast.default,forward,,1,1,1,2,5805,3 +259,alias_default_2,call_function,alias.default,unknown,,1,1,2,1,5805,3 +260,embedding,call_function,embedding.default,forward,,2,2,1,5,5804,5 +261,dtype_cast_1,call_function,dtype_cast.default,forward,0,1,1,1,1,5793,2 +262,alias_default_4,call_function,alias.default,forward,,1,1,3,6,5803,4 +263,convert_element_type,call_function,convert_element_type.default,forward,0,1,1,1,7,5801,4 +264,alias_default_6,call_function,alias.default,forward,0,1,1,2,8,5800,4 +265,pow_1,call_function,pow.Tensor_Scalar,forward,0,1,1,1,9,5799,4 +266,mean,call_function,mean.dim,forward,0,1,1,1,10,5798,4 +267,add,call_function,add.Scalar,forward,0,1,1,1,11,5797,3 
+268,rsqrt,call_function,rsqrt.default,forward,0,1,1,1,12,5796,3 +269,alias_default_7,call_function,alias.default,forward,0,1,1,3,13,5795,3 +270,mul,call_function,mul.Tensor,forward,0,2,2,1,14,5791,8 +271,alias_default_5,call_function,alias.default,forward,0,1,1,2,2,5792,2 +272,mul_1,call_function,mul.Tensor,forward,0,2,2,1,18,5790,8 +273,convert_element_type_1,call_function,convert_element_type.default,forward,0,1,1,1,19,5789,6 +274,dtype_cast_2,call_function,dtype_cast.default,forward,0,1,1,1,1,5776,3 +275,permute,call_function,permute.default,forward,0,1,1,1,2,5775,3 +276,alias_default_8,call_function,alias.default,forward,0,1,1,6,20,5788,4 +277,alias_default_9,call_function,alias.default,forward,0,1,1,2,3,5774,3 +278,einsum_default,call_function,einsum.default,forward,0,2,2,1,25,5772,5 +279,dtype_cast_3,call_function,dtype_cast.default,forward,0,1,1,1,1,5776,3 +280,permute_1,call_function,permute.default,forward,0,1,1,1,2,5775,3 +281,alias_default_10,call_function,alias.default,forward,0,1,1,2,3,5774,3 +282,einsum_default_1,call_function,einsum.default,forward,0,2,2,1,25,5772,5 +283,dtype_cast_4,call_function,dtype_cast.default,forward,0,1,1,1,1,5769,3 +284,permute_2,call_function,permute.default,forward,0,1,1,1,2,5768,3 +285,alias_default_11,call_function,alias.default,forward,0,1,1,2,3,5767,3 +286,einsum_default_2,call_function,einsum.default,forward,0,2,2,1,25,5765,5 +287,view_6,call_function,view.default,forward,0,1,1,1,26,5771,4 +288,view_7,call_function,view.default,forward,0,1,1,1,26,5771,4 +289,view_8,call_function,view.default,forward,0,1,1,1,26,5764,4 +290,convert_element_type_8,call_function,convert_element_type.default,forward,0,1,1,1,27,5770,4 +291,view_9,call_function,view.default,forward,0,1,1,1,28,5769,4 +292,view_as_complex,call_function,view_as_complex.default,forward,0,1,1,1,29,5768,6 +293,convert_element_type_9,call_function,convert_element_type.default,forward,0,1,1,1,27,5770,4 
+294,view_10,call_function,view.default,forward,0,1,1,1,28,5769,4 +295,view_as_complex_1,call_function,view_as_complex.default,forward,0,1,1,1,29,5768,6 +296,alias_default_1,call_function,alias.default,unknown,,1,1,28,1,5942,3 +297,view_11,call_function,view.default,forward,0,1,1,1,2,5779,3 +298,alias_default_12,call_function,alias.default,forward,0,1,1,4,3,5778,3 +299,mul_2,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8 +300,view_as_real,call_function,view_as_real.default,forward,0,1,1,1,35,5766,6 +301,view_12,call_function,view.default,forward,0,1,1,1,36,5765,6 +302,mul_3,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8 +303,view_as_real_1,call_function,view_as_real.default,forward,0,1,1,1,35,5766,6 +304,view_13,call_function,view.default,forward,0,1,1,1,36,5765,6 +305,convert_element_type_10,call_function,convert_element_type.default,forward,0,1,1,1,37,5764,6 +306,convert_element_type_11,call_function,convert_element_type.default,forward,0,1,1,1,37,5764,6 +307,permute_3,call_function,permute.default,forward,0,1,1,1,38,5763,6 +308,permute_4,call_function,permute.default,forward,0,1,1,1,38,5763,6 +309,permute_5,call_function,permute.default,forward,0,1,1,1,27,5763,4 +310,alias_default_13,call_function,alias.default,forward,0,1,1,2,39,5762,4 +311,alias_default_14,call_function,alias.default,forward,0,1,1,2,39,5762,4 +312,alias_default_15,call_function,alias.default,forward,0,1,1,2,28,5762,4 +313,_scaled_dot_product_flash_attention,call_function,_scaled_dot_product_flash_attention.default,forward,0,3,3,4,63,5761,2 +314,getitem,call_function,getitem,forward,0,1,1,1,64,5757,2 +315,getitem_1,call_function,getitem,forward,0,1,1,1,64,64,2 +316,getitem_6,call_function,getitem,forward,0,1,1,1,64,64,1 +317,getitem_7,call_function,getitem,forward,0,1,1,1,64,64,1 +318,alias_default_16,call_function,alias.default,forward,0,1,1,2,65,5756,4 +319,permute_6,call_function,permute.default,forward,0,1,1,1,66,5755,4 
+320,view_14,call_function,view.default,forward,0,1,1,1,67,5754,3 +321,dtype_cast_5,call_function,dtype_cast.default,forward,0,1,1,1,1,5756,3 +322,permute_7,call_function,permute.default,forward,0,1,1,1,2,5755,3 +323,alias_default_17,call_function,alias.default,forward,0,1,1,2,68,5753,4 +324,alias_default_18,call_function,alias.default,forward,0,1,1,2,3,5754,3 +325,einsum_default_3,call_function,einsum.default,forward,0,2,2,1,73,5752,5 +326,add_1,call_function,add.Tensor,forward,0,2,2,1,74,5751,10 +327,dtype_cast_6,call_function,dtype_cast.default,forward,0,1,1,1,1,5740,2 +328,alias_default_19,call_function,alias.default,forward,0,1,1,3,75,5750,4 +329,convert_element_type_14,call_function,convert_element_type.default,forward,0,1,1,1,76,5748,4 +330,alias_default_21,call_function,alias.default,forward,0,1,1,2,77,5747,4 +331,pow_2,call_function,pow.Tensor_Scalar,forward,0,1,1,1,78,5746,4 +332,mean_1,call_function,mean.dim,forward,0,1,1,1,79,5745,4 +333,add_2,call_function,add.Scalar,forward,0,1,1,1,80,5744,3 +334,rsqrt_1,call_function,rsqrt.default,forward,0,1,1,1,81,5743,3 +335,alias_default_22,call_function,alias.default,forward,0,1,1,3,82,5742,3 +336,mul_4,call_function,mul.Tensor,forward,0,2,2,1,83,5738,8 +337,alias_default_20,call_function,alias.default,forward,0,1,1,2,2,5739,2 +338,mul_5,call_function,mul.Tensor,forward,0,2,2,1,87,5737,8 +339,convert_element_type_15,call_function,convert_element_type.default,forward,0,1,1,1,88,5736,6 +340,dtype_cast_7,call_function,dtype_cast.default,forward,0,1,1,1,1,5736,3 +341,permute_8,call_function,permute.default,forward,0,1,1,1,2,5735,3 +342,alias_default_23,call_function,alias.default,forward,0,1,1,4,89,5735,4 +343,alias_default_24,call_function,alias.default,forward,0,1,1,2,3,5734,3 +344,einsum_default_4,call_function,einsum.default,forward,0,2,2,1,94,5732,5 +345,alias_default_25,call_function,alias.default,forward,0,1,1,2,95,5731,4 
+346,convert_element_type_18,call_function,convert_element_type.default,forward,0,1,1,1,96,5719,4 +347,alias_default_26,call_function,alias.default,forward,0,1,1,2,97,5718,4 +348,neg,call_function,neg.default,forward,0,1,1,1,98,5717,8 +349,exp,call_function,exp.default,forward,0,1,1,1,99,5716,6 +350,add_3,call_function,add.Tensor,forward,0,1,1,1,100,5715,4 +351,div,call_function,div.Tensor,forward,0,2,2,1,101,5714,6 +352,convert_element_type_19,call_function,convert_element_type.default,forward,0,1,1,1,102,5713,6 +353,dtype_cast_8,call_function,dtype_cast.default,forward,0,1,1,1,1,5717,3 +354,permute_9,call_function,permute.default,forward,0,1,1,1,2,5716,3 +355,alias_default_28,call_function,alias.default,forward,0,1,1,2,3,5715,3 +356,einsum_default_5,call_function,einsum.default,forward,0,2,2,1,94,5713,5 +357,alias_default_27,call_function,alias.default,forward,0,1,1,2,103,5712,4 +358,alias_default_29,call_function,alias.default,forward,0,1,1,2,95,5712,4 +359,mul_6,call_function,mul.Tensor,forward,0,2,2,1,110,5711,8 +360,dtype_cast_9,call_function,dtype_cast.default,forward,0,1,1,1,1,5713,3 +361,permute_10,call_function,permute.default,forward,0,1,1,1,2,5712,3 +362,alias_default_30,call_function,alias.default,forward,0,1,1,2,111,5710,4 +363,alias_default_31,call_function,alias.default,forward,0,1,1,2,3,5711,3 +364,einsum_default_6,call_function,einsum.default,forward,0,2,2,1,116,5709,5 +365,add_4,call_function,add.Tensor,forward,0,2,2,1,117,5708,10 +366,dtype_cast_10,call_function,dtype_cast.default,forward,1,1,1,1,1,5697,2 +367,alias_default_32,call_function,alias.default,forward,0,1,1,3,118,5707,4 +368,convert_element_type_24,call_function,convert_element_type.default,forward,1,1,1,1,119,5705,4 +369,alias_default_34,call_function,alias.default,forward,1,1,1,2,120,5704,4 +370,pow_3,call_function,pow.Tensor_Scalar,forward,1,1,1,1,121,5703,4 +371,mean_2,call_function,mean.dim,forward,1,1,1,1,122,5702,4 +372,add_5,call_function,add.Scalar,forward,1,1,1,1,123,5701,3 
+373,rsqrt_2,call_function,rsqrt.default,forward,1,1,1,1,124,5700,3 +374,alias_default_35,call_function,alias.default,forward,1,1,1,3,125,5699,3 +375,mul_7,call_function,mul.Tensor,forward,1,2,2,1,126,5695,8 +376,alias_default_33,call_function,alias.default,forward,1,1,1,2,2,5696,2 +377,mul_8,call_function,mul.Tensor,forward,1,2,2,1,130,5694,8 +378,convert_element_type_25,call_function,convert_element_type.default,forward,1,1,1,1,131,5693,6 +379,dtype_cast_11,call_function,dtype_cast.default,forward,1,1,1,1,1,5680,3 +380,permute_11,call_function,permute.default,forward,1,1,1,1,2,5679,3 +381,alias_default_36,call_function,alias.default,forward,1,1,1,6,132,5692,4 +382,alias_default_37,call_function,alias.default,forward,1,1,1,2,3,5678,3 +383,einsum_default_7,call_function,einsum.default,forward,1,2,2,1,137,5676,5 +384,dtype_cast_12,call_function,dtype_cast.default,forward,1,1,1,1,1,5680,3 +385,permute_12,call_function,permute.default,forward,1,1,1,1,2,5679,3 +386,alias_default_38,call_function,alias.default,forward,1,1,1,2,3,5678,3 +387,einsum_default_8,call_function,einsum.default,forward,1,2,2,1,137,5676,5 +388,dtype_cast_13,call_function,dtype_cast.default,forward,1,1,1,1,1,5673,3 +389,permute_13,call_function,permute.default,forward,1,1,1,1,2,5672,3 +390,alias_default_39,call_function,alias.default,forward,1,1,1,2,3,5671,3 +391,einsum_default_9,call_function,einsum.default,forward,1,2,2,1,137,5669,5 +392,view_29,call_function,view.default,forward,1,1,1,1,138,5675,4 +393,view_30,call_function,view.default,forward,1,1,1,1,138,5675,4 +394,view_31,call_function,view.default,forward,1,1,1,1,138,5668,4 +395,convert_element_type_32,call_function,convert_element_type.default,forward,1,1,1,1,139,5674,4 +396,view_32,call_function,view.default,forward,1,1,1,1,140,5673,4 +397,view_as_complex_2,call_function,view_as_complex.default,forward,1,1,1,1,141,5672,6 +398,convert_element_type_33,call_function,convert_element_type.default,forward,1,1,1,1,139,5674,4 
+399,view_33,call_function,view.default,forward,1,1,1,1,140,5673,4 +400,view_as_complex_3,call_function,view_as_complex.default,forward,1,1,1,1,141,5672,6 +401,view_34,call_function,view.default,forward,1,1,1,1,2,5683,3 +402,alias_default_40,call_function,alias.default,forward,1,1,1,4,3,5682,3 +403,mul_9,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8 +404,view_as_real_2,call_function,view_as_real.default,forward,1,1,1,1,145,5670,6 +405,view_35,call_function,view.default,forward,1,1,1,1,146,5669,6 +406,mul_10,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8 +407,view_as_real_3,call_function,view_as_real.default,forward,1,1,1,1,145,5670,6 +408,view_36,call_function,view.default,forward,1,1,1,1,146,5669,6 +409,convert_element_type_34,call_function,convert_element_type.default,forward,1,1,1,1,147,5668,6 +410,convert_element_type_35,call_function,convert_element_type.default,forward,1,1,1,1,147,5668,6 +411,permute_14,call_function,permute.default,forward,1,1,1,1,148,5667,6 +412,permute_15,call_function,permute.default,forward,1,1,1,1,148,5667,6 +413,permute_16,call_function,permute.default,forward,1,1,1,1,139,5667,4 +414,alias_default_41,call_function,alias.default,forward,1,1,1,2,149,5666,4 +415,alias_default_42,call_function,alias.default,forward,1,1,1,2,149,5666,4 +416,alias_default_43,call_function,alias.default,forward,1,1,1,2,140,5666,4 +417,_scaled_dot_product_flash_attention_1,call_function,_scaled_dot_product_flash_attention.default,forward,1,3,3,4,173,5665,2 +418,getitem_9,call_function,getitem,forward,1,1,1,1,174,5661,2 +419,getitem_10,call_function,getitem,forward,1,1,1,1,174,174,2 +420,getitem_15,call_function,getitem,forward,1,1,1,1,174,174,1 +421,getitem_16,call_function,getitem,forward,1,1,1,1,174,174,1 +422,alias_default_44,call_function,alias.default,forward,1,1,1,2,175,5660,4 +423,permute_17,call_function,permute.default,forward,1,1,1,1,176,5659,4 +424,view_37,call_function,view.default,forward,1,1,1,1,177,5658,3 
+425,dtype_cast_14,call_function,dtype_cast.default,forward,1,1,1,1,1,5660,3 +426,permute_18,call_function,permute.default,forward,1,1,1,1,2,5659,3 +427,alias_default_45,call_function,alias.default,forward,1,1,1,2,178,5657,4 +428,alias_default_46,call_function,alias.default,forward,1,1,1,2,3,5658,3 +429,einsum_default_10,call_function,einsum.default,forward,1,2,2,1,183,5656,5 +430,add_6,call_function,add.Tensor,forward,1,2,2,1,184,5655,10 +431,dtype_cast_15,call_function,dtype_cast.default,forward,1,1,1,1,1,5644,2 +432,alias_default_47,call_function,alias.default,forward,1,1,1,3,185,5654,4 +433,convert_element_type_38,call_function,convert_element_type.default,forward,1,1,1,1,186,5652,4 +434,alias_default_49,call_function,alias.default,forward,1,1,1,2,187,5651,4 +435,pow_4,call_function,pow.Tensor_Scalar,forward,1,1,1,1,188,5650,4 +436,mean_3,call_function,mean.dim,forward,1,1,1,1,189,5649,4 +437,add_7,call_function,add.Scalar,forward,1,1,1,1,190,5648,3 +438,rsqrt_3,call_function,rsqrt.default,forward,1,1,1,1,191,5647,3 +439,alias_default_50,call_function,alias.default,forward,1,1,1,3,192,5646,3 +440,mul_11,call_function,mul.Tensor,forward,1,2,2,1,193,5642,8 +441,alias_default_48,call_function,alias.default,forward,1,1,1,2,2,5643,2 +442,mul_12,call_function,mul.Tensor,forward,1,2,2,1,197,5641,8 +443,convert_element_type_39,call_function,convert_element_type.default,forward,1,1,1,1,198,5640,6 +444,dtype_cast_16,call_function,dtype_cast.default,forward,1,1,1,1,1,5640,3 +445,permute_19,call_function,permute.default,forward,1,1,1,1,2,5639,3 +446,alias_default_51,call_function,alias.default,forward,1,1,1,4,199,5639,4 +447,alias_default_52,call_function,alias.default,forward,1,1,1,2,3,5638,3 +448,einsum_default_11,call_function,einsum.default,forward,1,2,2,1,204,5636,5 +449,alias_default_53,call_function,alias.default,forward,1,1,1,2,205,5635,4 +450,convert_element_type_42,call_function,convert_element_type.default,forward,1,1,1,1,206,5623,4 
+451,alias_default_54,call_function,alias.default,forward,1,1,1,2,207,5622,4 +452,neg_1,call_function,neg.default,forward,1,1,1,1,208,5621,8 +453,exp_1,call_function,exp.default,forward,1,1,1,1,209,5620,6 +454,add_8,call_function,add.Tensor,forward,1,1,1,1,210,5619,4 +455,div_1,call_function,div.Tensor,forward,1,2,2,1,211,5618,6 +456,convert_element_type_43,call_function,convert_element_type.default,forward,1,1,1,1,212,5617,6 +457,dtype_cast_17,call_function,dtype_cast.default,forward,1,1,1,1,1,5621,3 +458,permute_20,call_function,permute.default,forward,1,1,1,1,2,5620,3 +459,alias_default_56,call_function,alias.default,forward,1,1,1,2,3,5619,3 +460,einsum_default_12,call_function,einsum.default,forward,1,2,2,1,204,5617,5 +461,alias_default_55,call_function,alias.default,forward,1,1,1,2,213,5616,4 +462,alias_default_57,call_function,alias.default,forward,1,1,1,2,205,5616,4 +463,mul_13,call_function,mul.Tensor,forward,1,2,2,1,220,5615,8 +464,dtype_cast_18,call_function,dtype_cast.default,forward,1,1,1,1,1,5617,3 +465,permute_21,call_function,permute.default,forward,1,1,1,1,2,5616,3 +466,alias_default_58,call_function,alias.default,forward,1,1,1,2,221,5614,4 +467,alias_default_59,call_function,alias.default,forward,1,1,1,2,3,5615,3 +468,einsum_default_13,call_function,einsum.default,forward,1,2,2,1,226,5613,5 +469,add_9,call_function,add.Tensor,forward,1,2,2,1,227,5612,10 +470,dtype_cast_19,call_function,dtype_cast.default,forward,2,1,1,1,1,5601,2 +471,alias_default_60,call_function,alias.default,forward,1,1,1,3,228,5611,4 +472,convert_element_type_48,call_function,convert_element_type.default,forward,2,1,1,1,229,5609,4 +473,alias_default_62,call_function,alias.default,forward,2,1,1,2,230,5608,4 +474,pow_5,call_function,pow.Tensor_Scalar,forward,2,1,1,1,231,5607,4 +475,mean_4,call_function,mean.dim,forward,2,1,1,1,232,5606,4 +476,add_10,call_function,add.Scalar,forward,2,1,1,1,233,5605,3 +477,rsqrt_4,call_function,rsqrt.default,forward,2,1,1,1,234,5604,3 
+478,alias_default_63,call_function,alias.default,forward,2,1,1,3,235,5603,3 +479,mul_14,call_function,mul.Tensor,forward,2,2,2,1,236,5599,8 +480,alias_default_61,call_function,alias.default,forward,2,1,1,2,2,5600,2 +481,mul_15,call_function,mul.Tensor,forward,2,2,2,1,240,5598,8 +482,convert_element_type_49,call_function,convert_element_type.default,forward,2,1,1,1,241,5597,6 +483,dtype_cast_20,call_function,dtype_cast.default,forward,2,1,1,1,1,5584,3 +484,permute_22,call_function,permute.default,forward,2,1,1,1,2,5583,3 +485,alias_default_64,call_function,alias.default,forward,2,1,1,6,242,5596,4 +486,alias_default_65,call_function,alias.default,forward,2,1,1,2,3,5582,3 +487,einsum_default_14,call_function,einsum.default,forward,2,2,2,1,247,5580,5 +488,dtype_cast_21,call_function,dtype_cast.default,forward,2,1,1,1,1,5584,3 +489,permute_23,call_function,permute.default,forward,2,1,1,1,2,5583,3 +490,alias_default_66,call_function,alias.default,forward,2,1,1,2,3,5582,3 +491,einsum_default_15,call_function,einsum.default,forward,2,2,2,1,247,5580,5 +492,dtype_cast_22,call_function,dtype_cast.default,forward,2,1,1,1,1,5577,3 +493,permute_24,call_function,permute.default,forward,2,1,1,1,2,5576,3 +494,alias_default_67,call_function,alias.default,forward,2,1,1,2,3,5575,3 +495,einsum_default_16,call_function,einsum.default,forward,2,2,2,1,247,5573,5 +496,view_52,call_function,view.default,forward,2,1,1,1,248,5579,4 +497,view_53,call_function,view.default,forward,2,1,1,1,248,5579,4 +498,view_54,call_function,view.default,forward,2,1,1,1,248,5572,4 +499,convert_element_type_56,call_function,convert_element_type.default,forward,2,1,1,1,249,5578,4 +500,view_55,call_function,view.default,forward,2,1,1,1,250,5577,4 +501,view_as_complex_4,call_function,view_as_complex.default,forward,2,1,1,1,251,5576,6 +502,convert_element_type_57,call_function,convert_element_type.default,forward,2,1,1,1,249,5578,4 +503,view_56,call_function,view.default,forward,2,1,1,1,250,5577,4 
+504,view_as_complex_5,call_function,view_as_complex.default,forward,2,1,1,1,251,5576,6 +505,view_57,call_function,view.default,forward,2,1,1,1,2,5587,3 +506,alias_default_68,call_function,alias.default,forward,2,1,1,4,3,5586,3 +507,mul_16,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8 +508,view_as_real_4,call_function,view_as_real.default,forward,2,1,1,1,255,5574,6 +509,view_58,call_function,view.default,forward,2,1,1,1,256,5573,6 +510,mul_17,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8 +511,view_as_real_5,call_function,view_as_real.default,forward,2,1,1,1,255,5574,6 +512,view_59,call_function,view.default,forward,2,1,1,1,256,5573,6 +513,convert_element_type_58,call_function,convert_element_type.default,forward,2,1,1,1,257,5572,6 +514,convert_element_type_59,call_function,convert_element_type.default,forward,2,1,1,1,257,5572,6 +515,permute_25,call_function,permute.default,forward,2,1,1,1,258,5571,6 +516,permute_26,call_function,permute.default,forward,2,1,1,1,258,5571,6 +517,permute_27,call_function,permute.default,forward,2,1,1,1,249,5571,4 +518,alias_default_69,call_function,alias.default,forward,2,1,1,2,259,5570,4 +519,alias_default_70,call_function,alias.default,forward,2,1,1,2,259,5570,4 +520,alias_default_71,call_function,alias.default,forward,2,1,1,2,250,5570,4 +521,_scaled_dot_product_flash_attention_2,call_function,_scaled_dot_product_flash_attention.default,forward,2,3,3,4,283,5569,2 +522,getitem_18,call_function,getitem,forward,2,1,1,1,284,5565,2 +523,getitem_19,call_function,getitem,forward,2,1,1,1,284,284,2 +524,getitem_24,call_function,getitem,forward,2,1,1,1,284,284,1 +525,getitem_25,call_function,getitem,forward,2,1,1,1,284,284,1 +526,alias_default_72,call_function,alias.default,forward,2,1,1,2,285,5564,4 +527,permute_28,call_function,permute.default,forward,2,1,1,1,286,5563,4 +528,view_60,call_function,view.default,forward,2,1,1,1,287,5562,3 +529,dtype_cast_23,call_function,dtype_cast.default,forward,2,1,1,1,1,5564,3 
+530,permute_29,call_function,permute.default,forward,2,1,1,1,2,5563,3 +531,alias_default_73,call_function,alias.default,forward,2,1,1,2,288,5561,4 +532,alias_default_74,call_function,alias.default,forward,2,1,1,2,3,5562,3 +533,einsum_default_17,call_function,einsum.default,forward,2,2,2,1,293,5560,5 +534,add_11,call_function,add.Tensor,forward,2,2,2,1,294,5559,10 +535,dtype_cast_24,call_function,dtype_cast.default,forward,2,1,1,1,1,5548,2 +536,alias_default_75,call_function,alias.default,forward,2,1,1,3,295,5558,4 +537,convert_element_type_62,call_function,convert_element_type.default,forward,2,1,1,1,296,5556,4 +538,alias_default_77,call_function,alias.default,forward,2,1,1,2,297,5555,4 +539,pow_6,call_function,pow.Tensor_Scalar,forward,2,1,1,1,298,5554,4 +540,mean_5,call_function,mean.dim,forward,2,1,1,1,299,5553,4 +541,add_12,call_function,add.Scalar,forward,2,1,1,1,300,5552,3 +542,rsqrt_5,call_function,rsqrt.default,forward,2,1,1,1,301,5551,3 +543,alias_default_78,call_function,alias.default,forward,2,1,1,3,302,5550,3 +544,mul_18,call_function,mul.Tensor,forward,2,2,2,1,303,5546,8 +545,alias_default_76,call_function,alias.default,forward,2,1,1,2,2,5547,2 +546,mul_19,call_function,mul.Tensor,forward,2,2,2,1,307,5545,8 +547,convert_element_type_63,call_function,convert_element_type.default,forward,2,1,1,1,308,5544,6 +548,dtype_cast_25,call_function,dtype_cast.default,forward,2,1,1,1,1,5544,3 +549,permute_30,call_function,permute.default,forward,2,1,1,1,2,5543,3 +550,alias_default_79,call_function,alias.default,forward,2,1,1,4,309,5543,4 +551,alias_default_80,call_function,alias.default,forward,2,1,1,2,3,5542,3 +552,einsum_default_18,call_function,einsum.default,forward,2,2,2,1,314,5540,5 +553,alias_default_81,call_function,alias.default,forward,2,1,1,2,315,5539,4 +554,convert_element_type_66,call_function,convert_element_type.default,forward,2,1,1,1,316,5527,4 +555,alias_default_82,call_function,alias.default,forward,2,1,1,2,317,5526,4 
+556,neg_2,call_function,neg.default,forward,2,1,1,1,318,5525,8 +557,exp_2,call_function,exp.default,forward,2,1,1,1,319,5524,6 +558,add_13,call_function,add.Tensor,forward,2,1,1,1,320,5523,4 +559,div_2,call_function,div.Tensor,forward,2,2,2,1,321,5522,6 +560,convert_element_type_67,call_function,convert_element_type.default,forward,2,1,1,1,322,5521,6 +561,dtype_cast_26,call_function,dtype_cast.default,forward,2,1,1,1,1,5525,3 +562,permute_31,call_function,permute.default,forward,2,1,1,1,2,5524,3 +563,alias_default_84,call_function,alias.default,forward,2,1,1,2,3,5523,3 +564,einsum_default_19,call_function,einsum.default,forward,2,2,2,1,314,5521,5 +565,alias_default_83,call_function,alias.default,forward,2,1,1,2,323,5520,4 +566,alias_default_85,call_function,alias.default,forward,2,1,1,2,315,5520,4 +567,mul_20,call_function,mul.Tensor,forward,2,2,2,1,330,5519,8 +568,dtype_cast_27,call_function,dtype_cast.default,forward,2,1,1,1,1,5521,3 +569,permute_32,call_function,permute.default,forward,2,1,1,1,2,5520,3 +570,alias_default_86,call_function,alias.default,forward,2,1,1,2,331,5518,4 +571,alias_default_87,call_function,alias.default,forward,2,1,1,2,3,5519,3 +572,einsum_default_20,call_function,einsum.default,forward,2,2,2,1,336,5517,5 +573,add_14,call_function,add.Tensor,forward,2,2,2,1,337,5516,10 +574,dtype_cast_28,call_function,dtype_cast.default,forward,3,1,1,1,1,5505,2 +575,alias_default_88,call_function,alias.default,forward,2,1,1,3,338,5515,4 +576,convert_element_type_72,call_function,convert_element_type.default,forward,3,1,1,1,339,5513,4 +577,alias_default_90,call_function,alias.default,forward,3,1,1,2,340,5512,4 +578,pow_7,call_function,pow.Tensor_Scalar,forward,3,1,1,1,341,5511,4 +579,mean_6,call_function,mean.dim,forward,3,1,1,1,342,5510,4 +580,add_15,call_function,add.Scalar,forward,3,1,1,1,343,5509,3 +581,rsqrt_6,call_function,rsqrt.default,forward,3,1,1,1,344,5508,3 +582,alias_default_91,call_function,alias.default,forward,3,1,1,3,345,5507,3 
+583,mul_21,call_function,mul.Tensor,forward,3,2,2,1,346,5503,8 +584,alias_default_89,call_function,alias.default,forward,3,1,1,2,2,5504,2 +585,mul_22,call_function,mul.Tensor,forward,3,2,2,1,350,5502,8 +586,convert_element_type_73,call_function,convert_element_type.default,forward,3,1,1,1,351,5501,6 +587,dtype_cast_29,call_function,dtype_cast.default,forward,3,1,1,1,1,5488,3 +588,permute_33,call_function,permute.default,forward,3,1,1,1,2,5487,3 +589,alias_default_92,call_function,alias.default,forward,3,1,1,6,352,5500,4 +590,alias_default_93,call_function,alias.default,forward,3,1,1,2,3,5486,3 +591,einsum_default_21,call_function,einsum.default,forward,3,2,2,1,357,5484,5 +592,dtype_cast_30,call_function,dtype_cast.default,forward,3,1,1,1,1,5488,3 +593,permute_34,call_function,permute.default,forward,3,1,1,1,2,5487,3 +594,alias_default_94,call_function,alias.default,forward,3,1,1,2,3,5486,3 +595,einsum_default_22,call_function,einsum.default,forward,3,2,2,1,357,5484,5 +596,dtype_cast_31,call_function,dtype_cast.default,forward,3,1,1,1,1,5481,3 +597,permute_35,call_function,permute.default,forward,3,1,1,1,2,5480,3 +598,alias_default_95,call_function,alias.default,forward,3,1,1,2,3,5479,3 +599,einsum_default_23,call_function,einsum.default,forward,3,2,2,1,357,5477,5 +600,view_75,call_function,view.default,forward,3,1,1,1,358,5483,4 +601,view_76,call_function,view.default,forward,3,1,1,1,358,5483,4 +602,view_77,call_function,view.default,forward,3,1,1,1,358,5476,4 +603,convert_element_type_80,call_function,convert_element_type.default,forward,3,1,1,1,359,5482,4 +604,view_78,call_function,view.default,forward,3,1,1,1,360,5481,4 +605,view_as_complex_6,call_function,view_as_complex.default,forward,3,1,1,1,361,5480,6 +606,convert_element_type_81,call_function,convert_element_type.default,forward,3,1,1,1,359,5482,4 +607,view_79,call_function,view.default,forward,3,1,1,1,360,5481,4 +608,view_as_complex_7,call_function,view_as_complex.default,forward,3,1,1,1,361,5480,6 
+609,view_80,call_function,view.default,forward,3,1,1,1,2,5491,3 +610,alias_default_96,call_function,alias.default,forward,3,1,1,4,3,5490,3 +611,mul_23,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8 +612,view_as_real_6,call_function,view_as_real.default,forward,3,1,1,1,365,5478,6 +613,view_81,call_function,view.default,forward,3,1,1,1,366,5477,6 +614,mul_24,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8 +615,view_as_real_7,call_function,view_as_real.default,forward,3,1,1,1,365,5478,6 +616,view_82,call_function,view.default,forward,3,1,1,1,366,5477,6 +617,convert_element_type_82,call_function,convert_element_type.default,forward,3,1,1,1,367,5476,6 +618,convert_element_type_83,call_function,convert_element_type.default,forward,3,1,1,1,367,5476,6 +619,permute_36,call_function,permute.default,forward,3,1,1,1,368,5475,6 +620,permute_37,call_function,permute.default,forward,3,1,1,1,368,5475,6 +621,permute_38,call_function,permute.default,forward,3,1,1,1,359,5475,4 +622,alias_default_97,call_function,alias.default,forward,3,1,1,2,369,5474,4 +623,alias_default_98,call_function,alias.default,forward,3,1,1,2,369,5474,4 +624,alias_default_99,call_function,alias.default,forward,3,1,1,2,360,5474,4 +625,_scaled_dot_product_flash_attention_3,call_function,_scaled_dot_product_flash_attention.default,forward,3,3,3,4,393,5473,2 +626,getitem_27,call_function,getitem,forward,3,1,1,1,394,5469,2 +627,getitem_28,call_function,getitem,forward,3,1,1,1,394,394,2 +628,getitem_33,call_function,getitem,forward,3,1,1,1,394,394,1 +629,getitem_34,call_function,getitem,forward,3,1,1,1,394,394,1 +630,alias_default_100,call_function,alias.default,forward,3,1,1,2,395,5468,4 +631,permute_39,call_function,permute.default,forward,3,1,1,1,396,5467,4 +632,view_83,call_function,view.default,forward,3,1,1,1,397,5466,3 +633,dtype_cast_32,call_function,dtype_cast.default,forward,3,1,1,1,1,5468,3 +634,permute_40,call_function,permute.default,forward,3,1,1,1,2,5467,3 
+635,alias_default_101,call_function,alias.default,forward,3,1,1,2,398,5465,4 +636,alias_default_102,call_function,alias.default,forward,3,1,1,2,3,5466,3 +637,einsum_default_24,call_function,einsum.default,forward,3,2,2,1,403,5464,5 +638,add_16,call_function,add.Tensor,forward,3,2,2,1,404,5463,10 +639,dtype_cast_33,call_function,dtype_cast.default,forward,3,1,1,1,1,5452,2 +640,alias_default_103,call_function,alias.default,forward,3,1,1,3,405,5462,4 +641,convert_element_type_86,call_function,convert_element_type.default,forward,3,1,1,1,406,5460,4 +642,alias_default_105,call_function,alias.default,forward,3,1,1,2,407,5459,4 +643,pow_8,call_function,pow.Tensor_Scalar,forward,3,1,1,1,408,5458,4 +644,mean_7,call_function,mean.dim,forward,3,1,1,1,409,5457,4 +645,add_17,call_function,add.Scalar,forward,3,1,1,1,410,5456,3 +646,rsqrt_7,call_function,rsqrt.default,forward,3,1,1,1,411,5455,3 +647,alias_default_106,call_function,alias.default,forward,3,1,1,3,412,5454,3 +648,mul_25,call_function,mul.Tensor,forward,3,2,2,1,413,5450,8 +649,alias_default_104,call_function,alias.default,forward,3,1,1,2,2,5451,2 +650,mul_26,call_function,mul.Tensor,forward,3,2,2,1,417,5449,8 +651,convert_element_type_87,call_function,convert_element_type.default,forward,3,1,1,1,418,5448,6 +652,dtype_cast_34,call_function,dtype_cast.default,forward,3,1,1,1,1,5448,3 +653,permute_41,call_function,permute.default,forward,3,1,1,1,2,5447,3 +654,alias_default_107,call_function,alias.default,forward,3,1,1,4,419,5447,4 +655,alias_default_108,call_function,alias.default,forward,3,1,1,2,3,5446,3 +656,einsum_default_25,call_function,einsum.default,forward,3,2,2,1,424,5444,5 +657,alias_default_109,call_function,alias.default,forward,3,1,1,2,425,5443,4 +658,convert_element_type_90,call_function,convert_element_type.default,forward,3,1,1,1,426,5431,4 +659,alias_default_110,call_function,alias.default,forward,3,1,1,2,427,5430,4 +660,neg_3,call_function,neg.default,forward,3,1,1,1,428,5429,8 
+661,exp_3,call_function,exp.default,forward,3,1,1,1,429,5428,6 +662,add_18,call_function,add.Tensor,forward,3,1,1,1,430,5427,4 +663,div_3,call_function,div.Tensor,forward,3,2,2,1,431,5426,6 +664,convert_element_type_91,call_function,convert_element_type.default,forward,3,1,1,1,432,5425,6 +665,dtype_cast_35,call_function,dtype_cast.default,forward,3,1,1,1,1,5429,3 +666,permute_42,call_function,permute.default,forward,3,1,1,1,2,5428,3 +667,alias_default_112,call_function,alias.default,forward,3,1,1,2,3,5427,3 +668,einsum_default_26,call_function,einsum.default,forward,3,2,2,1,424,5425,5 +669,alias_default_111,call_function,alias.default,forward,3,1,1,2,433,5424,4 +670,alias_default_113,call_function,alias.default,forward,3,1,1,2,425,5424,4 +671,mul_27,call_function,mul.Tensor,forward,3,2,2,1,440,5423,8 +672,dtype_cast_36,call_function,dtype_cast.default,forward,3,1,1,1,1,5425,3 +673,permute_43,call_function,permute.default,forward,3,1,1,1,2,5424,3 +674,alias_default_114,call_function,alias.default,forward,3,1,1,2,441,5422,4 +675,alias_default_115,call_function,alias.default,forward,3,1,1,2,3,5423,3 +676,einsum_default_27,call_function,einsum.default,forward,3,2,2,1,446,5421,5 +677,add_19,call_function,add.Tensor,forward,3,2,2,1,447,5420,10 +678,dtype_cast_37,call_function,dtype_cast.default,forward,4,1,1,1,1,5409,2 +679,alias_default_116,call_function,alias.default,forward,3,1,1,3,448,5419,4 +680,convert_element_type_96,call_function,convert_element_type.default,forward,4,1,1,1,449,5417,4 +681,alias_default_118,call_function,alias.default,forward,4,1,1,2,450,5416,4 +682,pow_9,call_function,pow.Tensor_Scalar,forward,4,1,1,1,451,5415,4 +683,mean_8,call_function,mean.dim,forward,4,1,1,1,452,5414,4 +684,add_20,call_function,add.Scalar,forward,4,1,1,1,453,5413,3 +685,rsqrt_8,call_function,rsqrt.default,forward,4,1,1,1,454,5412,3 +686,alias_default_119,call_function,alias.default,forward,4,1,1,3,455,5411,3 +687,mul_28,call_function,mul.Tensor,forward,4,2,2,1,456,5407,8 
+688,alias_default_117,call_function,alias.default,forward,4,1,1,2,2,5408,2 +689,mul_29,call_function,mul.Tensor,forward,4,2,2,1,460,5406,8 +690,convert_element_type_97,call_function,convert_element_type.default,forward,4,1,1,1,461,5405,6 +691,dtype_cast_38,call_function,dtype_cast.default,forward,4,1,1,1,1,5392,3 +692,permute_44,call_function,permute.default,forward,4,1,1,1,2,5391,3 +693,alias_default_120,call_function,alias.default,forward,4,1,1,6,462,5404,4 +694,alias_default_121,call_function,alias.default,forward,4,1,1,2,3,5390,3 +695,einsum_default_28,call_function,einsum.default,forward,4,2,2,1,467,5388,5 +696,dtype_cast_39,call_function,dtype_cast.default,forward,4,1,1,1,1,5392,3 +697,permute_45,call_function,permute.default,forward,4,1,1,1,2,5391,3 +698,alias_default_122,call_function,alias.default,forward,4,1,1,2,3,5390,3 +699,einsum_default_29,call_function,einsum.default,forward,4,2,2,1,467,5388,5 +700,dtype_cast_40,call_function,dtype_cast.default,forward,4,1,1,1,1,5385,3 +701,permute_46,call_function,permute.default,forward,4,1,1,1,2,5384,3 +702,alias_default_123,call_function,alias.default,forward,4,1,1,2,3,5383,3 +703,einsum_default_30,call_function,einsum.default,forward,4,2,2,1,467,5381,5 +704,view_98,call_function,view.default,forward,4,1,1,1,468,5387,4 +705,view_99,call_function,view.default,forward,4,1,1,1,468,5387,4 +706,view_100,call_function,view.default,forward,4,1,1,1,468,5380,4 +707,convert_element_type_104,call_function,convert_element_type.default,forward,4,1,1,1,469,5386,4 +708,view_101,call_function,view.default,forward,4,1,1,1,470,5385,4 +709,view_as_complex_8,call_function,view_as_complex.default,forward,4,1,1,1,471,5384,6 +710,convert_element_type_105,call_function,convert_element_type.default,forward,4,1,1,1,469,5386,4 +711,view_102,call_function,view.default,forward,4,1,1,1,470,5385,4 +712,view_as_complex_9,call_function,view_as_complex.default,forward,4,1,1,1,471,5384,6 
+713,view_103,call_function,view.default,forward,4,1,1,1,2,5395,3 +714,alias_default_124,call_function,alias.default,forward,4,1,1,4,3,5394,3 +715,mul_30,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8 +716,view_as_real_8,call_function,view_as_real.default,forward,4,1,1,1,475,5382,6 +717,view_104,call_function,view.default,forward,4,1,1,1,476,5381,6 +718,mul_31,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8 +719,view_as_real_9,call_function,view_as_real.default,forward,4,1,1,1,475,5382,6 +720,view_105,call_function,view.default,forward,4,1,1,1,476,5381,6 +721,convert_element_type_106,call_function,convert_element_type.default,forward,4,1,1,1,477,5380,6 +722,convert_element_type_107,call_function,convert_element_type.default,forward,4,1,1,1,477,5380,6 +723,permute_47,call_function,permute.default,forward,4,1,1,1,478,5379,6 +724,permute_48,call_function,permute.default,forward,4,1,1,1,478,5379,6 +725,permute_49,call_function,permute.default,forward,4,1,1,1,469,5379,4 +726,alias_default_125,call_function,alias.default,forward,4,1,1,2,479,5378,4 +727,alias_default_126,call_function,alias.default,forward,4,1,1,2,479,5378,4 +728,alias_default_127,call_function,alias.default,forward,4,1,1,2,470,5378,4 +729,_scaled_dot_product_flash_attention_4,call_function,_scaled_dot_product_flash_attention.default,forward,4,3,3,4,503,5377,2 +730,getitem_36,call_function,getitem,forward,4,1,1,1,504,5373,2 +731,getitem_37,call_function,getitem,forward,4,1,1,1,504,504,2 +732,getitem_42,call_function,getitem,forward,4,1,1,1,504,504,1 +733,getitem_43,call_function,getitem,forward,4,1,1,1,504,504,1 +734,alias_default_128,call_function,alias.default,forward,4,1,1,2,505,5372,4 +735,permute_50,call_function,permute.default,forward,4,1,1,1,506,5371,4 +736,view_106,call_function,view.default,forward,4,1,1,1,507,5370,3 +737,dtype_cast_41,call_function,dtype_cast.default,forward,4,1,1,1,1,5372,3 +738,permute_51,call_function,permute.default,forward,4,1,1,1,2,5371,3 
+739,alias_default_129,call_function,alias.default,forward,4,1,1,2,508,5369,4 +740,alias_default_130,call_function,alias.default,forward,4,1,1,2,3,5370,3 +741,einsum_default_31,call_function,einsum.default,forward,4,2,2,1,513,5368,5 +742,add_21,call_function,add.Tensor,forward,4,2,2,1,514,5367,10 +743,dtype_cast_42,call_function,dtype_cast.default,forward,4,1,1,1,1,5356,2 +744,alias_default_131,call_function,alias.default,forward,4,1,1,3,515,5366,4 +745,convert_element_type_110,call_function,convert_element_type.default,forward,4,1,1,1,516,5364,4 +746,alias_default_133,call_function,alias.default,forward,4,1,1,2,517,5363,4 +747,pow_10,call_function,pow.Tensor_Scalar,forward,4,1,1,1,518,5362,4 +748,mean_9,call_function,mean.dim,forward,4,1,1,1,519,5361,4 +749,add_22,call_function,add.Scalar,forward,4,1,1,1,520,5360,3 +750,rsqrt_9,call_function,rsqrt.default,forward,4,1,1,1,521,5359,3 +751,alias_default_134,call_function,alias.default,forward,4,1,1,3,522,5358,3 +752,mul_32,call_function,mul.Tensor,forward,4,2,2,1,523,5354,8 +753,alias_default_132,call_function,alias.default,forward,4,1,1,2,2,5355,2 +754,mul_33,call_function,mul.Tensor,forward,4,2,2,1,527,5353,8 +755,convert_element_type_111,call_function,convert_element_type.default,forward,4,1,1,1,528,5352,6 +756,dtype_cast_43,call_function,dtype_cast.default,forward,4,1,1,1,1,5352,3 +757,permute_52,call_function,permute.default,forward,4,1,1,1,2,5351,3 +758,alias_default_135,call_function,alias.default,forward,4,1,1,4,529,5351,4 +759,alias_default_136,call_function,alias.default,forward,4,1,1,2,3,5350,3 +760,einsum_default_32,call_function,einsum.default,forward,4,2,2,1,534,5348,5 +761,alias_default_137,call_function,alias.default,forward,4,1,1,2,535,5347,4 +762,convert_element_type_114,call_function,convert_element_type.default,forward,4,1,1,1,536,5335,4 +763,alias_default_138,call_function,alias.default,forward,4,1,1,2,537,5334,4 +764,neg_4,call_function,neg.default,forward,4,1,1,1,538,5333,8 
+765,exp_4,call_function,exp.default,forward,4,1,1,1,539,5332,6 +766,add_23,call_function,add.Tensor,forward,4,1,1,1,540,5331,4 +767,div_4,call_function,div.Tensor,forward,4,2,2,1,541,5330,6 +768,convert_element_type_115,call_function,convert_element_type.default,forward,4,1,1,1,542,5329,6 +769,dtype_cast_44,call_function,dtype_cast.default,forward,4,1,1,1,1,5333,3 +770,permute_53,call_function,permute.default,forward,4,1,1,1,2,5332,3 +771,alias_default_140,call_function,alias.default,forward,4,1,1,2,3,5331,3 +772,einsum_default_33,call_function,einsum.default,forward,4,2,2,1,534,5329,5 +773,alias_default_139,call_function,alias.default,forward,4,1,1,2,543,5328,4 +774,alias_default_141,call_function,alias.default,forward,4,1,1,2,535,5328,4 +775,mul_34,call_function,mul.Tensor,forward,4,2,2,1,550,5327,8 +776,dtype_cast_45,call_function,dtype_cast.default,forward,4,1,1,1,1,5329,3 +777,permute_54,call_function,permute.default,forward,4,1,1,1,2,5328,3 +778,alias_default_142,call_function,alias.default,forward,4,1,1,2,551,5326,4 +779,alias_default_143,call_function,alias.default,forward,4,1,1,2,3,5327,3 +780,einsum_default_34,call_function,einsum.default,forward,4,2,2,1,556,5325,5 +781,add_24,call_function,add.Tensor,forward,4,2,2,1,557,5324,10 +782,dtype_cast_46,call_function,dtype_cast.default,forward,5,1,1,1,1,5313,2 +783,alias_default_144,call_function,alias.default,forward,4,1,1,3,558,5323,4 +784,convert_element_type_120,call_function,convert_element_type.default,forward,5,1,1,1,559,5321,4 +785,alias_default_146,call_function,alias.default,forward,5,1,1,2,560,5320,4 +786,pow_11,call_function,pow.Tensor_Scalar,forward,5,1,1,1,561,5319,4 +787,mean_10,call_function,mean.dim,forward,5,1,1,1,562,5318,4 +788,add_25,call_function,add.Scalar,forward,5,1,1,1,563,5317,3 +789,rsqrt_10,call_function,rsqrt.default,forward,5,1,1,1,564,5316,3 +790,alias_default_147,call_function,alias.default,forward,5,1,1,3,565,5315,3 
+791,mul_35,call_function,mul.Tensor,forward,5,2,2,1,566,5311,8 +792,alias_default_145,call_function,alias.default,forward,5,1,1,2,2,5312,2 +793,mul_36,call_function,mul.Tensor,forward,5,2,2,1,570,5310,8 +794,convert_element_type_121,call_function,convert_element_type.default,forward,5,1,1,1,571,5309,6 +795,dtype_cast_47,call_function,dtype_cast.default,forward,5,1,1,1,1,5296,3 +796,permute_55,call_function,permute.default,forward,5,1,1,1,2,5295,3 +797,alias_default_148,call_function,alias.default,forward,5,1,1,6,572,5308,4 +798,alias_default_149,call_function,alias.default,forward,5,1,1,2,3,5294,3 +799,einsum_default_35,call_function,einsum.default,forward,5,2,2,1,577,5292,5 +800,dtype_cast_48,call_function,dtype_cast.default,forward,5,1,1,1,1,5296,3 +801,permute_56,call_function,permute.default,forward,5,1,1,1,2,5295,3 +802,alias_default_150,call_function,alias.default,forward,5,1,1,2,3,5294,3 +803,einsum_default_36,call_function,einsum.default,forward,5,2,2,1,577,5292,5 +804,dtype_cast_49,call_function,dtype_cast.default,forward,5,1,1,1,1,5289,3 +805,permute_57,call_function,permute.default,forward,5,1,1,1,2,5288,3 +806,alias_default_151,call_function,alias.default,forward,5,1,1,2,3,5287,3 +807,einsum_default_37,call_function,einsum.default,forward,5,2,2,1,577,5285,5 +808,view_121,call_function,view.default,forward,5,1,1,1,578,5291,4 +809,view_122,call_function,view.default,forward,5,1,1,1,578,5291,4 +810,view_123,call_function,view.default,forward,5,1,1,1,578,5284,4 +811,convert_element_type_128,call_function,convert_element_type.default,forward,5,1,1,1,579,5290,4 +812,view_124,call_function,view.default,forward,5,1,1,1,580,5289,4 +813,view_as_complex_10,call_function,view_as_complex.default,forward,5,1,1,1,581,5288,6 +814,convert_element_type_129,call_function,convert_element_type.default,forward,5,1,1,1,579,5290,4 +815,view_125,call_function,view.default,forward,5,1,1,1,580,5289,4 
+816,view_as_complex_11,call_function,view_as_complex.default,forward,5,1,1,1,581,5288,6 +817,view_126,call_function,view.default,forward,5,1,1,1,2,5299,3 +818,alias_default_152,call_function,alias.default,forward,5,1,1,4,3,5298,3 +819,mul_37,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8 +820,view_as_real_10,call_function,view_as_real.default,forward,5,1,1,1,585,5286,6 +821,view_127,call_function,view.default,forward,5,1,1,1,586,5285,6 +822,mul_38,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8 +823,view_as_real_11,call_function,view_as_real.default,forward,5,1,1,1,585,5286,6 +824,view_128,call_function,view.default,forward,5,1,1,1,586,5285,6 +825,convert_element_type_130,call_function,convert_element_type.default,forward,5,1,1,1,587,5284,6 +826,convert_element_type_131,call_function,convert_element_type.default,forward,5,1,1,1,587,5284,6 +827,permute_58,call_function,permute.default,forward,5,1,1,1,588,5283,6 +828,permute_59,call_function,permute.default,forward,5,1,1,1,588,5283,6 +829,permute_60,call_function,permute.default,forward,5,1,1,1,579,5283,4 +830,alias_default_153,call_function,alias.default,forward,5,1,1,2,589,5282,4 +831,alias_default_154,call_function,alias.default,forward,5,1,1,2,589,5282,4 +832,alias_default_155,call_function,alias.default,forward,5,1,1,2,580,5282,4 +833,_scaled_dot_product_flash_attention_5,call_function,_scaled_dot_product_flash_attention.default,forward,5,3,3,4,613,5281,2 +834,getitem_45,call_function,getitem,forward,5,1,1,1,614,5277,2 +835,getitem_46,call_function,getitem,forward,5,1,1,1,614,614,2 +836,getitem_51,call_function,getitem,forward,5,1,1,1,614,614,1 +837,getitem_52,call_function,getitem,forward,5,1,1,1,614,614,1 +838,alias_default_156,call_function,alias.default,forward,5,1,1,2,615,5276,4 +839,permute_61,call_function,permute.default,forward,5,1,1,1,616,5275,4 +840,view_129,call_function,view.default,forward,5,1,1,1,617,5274,3 +841,dtype_cast_50,call_function,dtype_cast.default,forward,5,1,1,1,1,5276,3 
+842,permute_62,call_function,permute.default,forward,5,1,1,1,2,5275,3 +843,alias_default_157,call_function,alias.default,forward,5,1,1,2,618,5273,4 +844,alias_default_158,call_function,alias.default,forward,5,1,1,2,3,5274,3 +845,einsum_default_38,call_function,einsum.default,forward,5,2,2,1,623,5272,5 +846,add_26,call_function,add.Tensor,forward,5,2,2,1,624,5271,10 +847,dtype_cast_51,call_function,dtype_cast.default,forward,5,1,1,1,1,5260,2 +848,alias_default_159,call_function,alias.default,forward,5,1,1,3,625,5270,4 +849,convert_element_type_134,call_function,convert_element_type.default,forward,5,1,1,1,626,5268,4 +850,alias_default_161,call_function,alias.default,forward,5,1,1,2,627,5267,4 +851,pow_12,call_function,pow.Tensor_Scalar,forward,5,1,1,1,628,5266,4 +852,mean_11,call_function,mean.dim,forward,5,1,1,1,629,5265,4 +853,add_27,call_function,add.Scalar,forward,5,1,1,1,630,5264,3 +854,rsqrt_11,call_function,rsqrt.default,forward,5,1,1,1,631,5263,3 +855,alias_default_162,call_function,alias.default,forward,5,1,1,3,632,5262,3 +856,mul_39,call_function,mul.Tensor,forward,5,2,2,1,633,5258,8 +857,alias_default_160,call_function,alias.default,forward,5,1,1,2,2,5259,2 +858,mul_40,call_function,mul.Tensor,forward,5,2,2,1,637,5257,8 +859,convert_element_type_135,call_function,convert_element_type.default,forward,5,1,1,1,638,5256,6 +860,dtype_cast_52,call_function,dtype_cast.default,forward,5,1,1,1,1,5256,3 +861,permute_63,call_function,permute.default,forward,5,1,1,1,2,5255,3 +862,alias_default_163,call_function,alias.default,forward,5,1,1,4,639,5255,4 +863,alias_default_164,call_function,alias.default,forward,5,1,1,2,3,5254,3 +864,einsum_default_39,call_function,einsum.default,forward,5,2,2,1,644,5252,5 +865,alias_default_165,call_function,alias.default,forward,5,1,1,2,645,5251,4 +866,convert_element_type_138,call_function,convert_element_type.default,forward,5,1,1,1,646,5239,4 +867,alias_default_166,call_function,alias.default,forward,5,1,1,2,647,5238,4 
+868,neg_5,call_function,neg.default,forward,5,1,1,1,648,5237,8 +869,exp_5,call_function,exp.default,forward,5,1,1,1,649,5236,6 +870,add_28,call_function,add.Tensor,forward,5,1,1,1,650,5235,4 +871,div_5,call_function,div.Tensor,forward,5,2,2,1,651,5234,6 +872,convert_element_type_139,call_function,convert_element_type.default,forward,5,1,1,1,652,5233,6 +873,dtype_cast_53,call_function,dtype_cast.default,forward,5,1,1,1,1,5237,3 +874,permute_64,call_function,permute.default,forward,5,1,1,1,2,5236,3 +875,alias_default_168,call_function,alias.default,forward,5,1,1,2,3,5235,3 +876,einsum_default_40,call_function,einsum.default,forward,5,2,2,1,644,5233,5 +877,alias_default_167,call_function,alias.default,forward,5,1,1,2,653,5232,4 +878,alias_default_169,call_function,alias.default,forward,5,1,1,2,645,5232,4 +879,mul_41,call_function,mul.Tensor,forward,5,2,2,1,660,5231,8 +880,dtype_cast_54,call_function,dtype_cast.default,forward,5,1,1,1,1,5233,3 +881,permute_65,call_function,permute.default,forward,5,1,1,1,2,5232,3 +882,alias_default_170,call_function,alias.default,forward,5,1,1,2,661,5230,4 +883,alias_default_171,call_function,alias.default,forward,5,1,1,2,3,5231,3 +884,einsum_default_41,call_function,einsum.default,forward,5,2,2,1,666,5229,5 +885,add_29,call_function,add.Tensor,forward,5,2,2,1,667,5228,10 +886,dtype_cast_55,call_function,dtype_cast.default,forward,6,1,1,1,1,5217,2 +887,alias_default_172,call_function,alias.default,forward,5,1,1,3,668,5227,4 +888,convert_element_type_144,call_function,convert_element_type.default,forward,6,1,1,1,669,5225,4 +889,alias_default_174,call_function,alias.default,forward,6,1,1,2,670,5224,4 +890,pow_13,call_function,pow.Tensor_Scalar,forward,6,1,1,1,671,5223,4 +891,mean_12,call_function,mean.dim,forward,6,1,1,1,672,5222,4 +892,add_30,call_function,add.Scalar,forward,6,1,1,1,673,5221,3 +893,rsqrt_12,call_function,rsqrt.default,forward,6,1,1,1,674,5220,3 
+894,alias_default_175,call_function,alias.default,forward,6,1,1,3,675,5219,3 +895,mul_42,call_function,mul.Tensor,forward,6,2,2,1,676,5215,8 +896,alias_default_173,call_function,alias.default,forward,6,1,1,2,2,5216,2 +897,mul_43,call_function,mul.Tensor,forward,6,2,2,1,680,5214,8 +898,convert_element_type_145,call_function,convert_element_type.default,forward,6,1,1,1,681,5213,6 +899,dtype_cast_56,call_function,dtype_cast.default,forward,6,1,1,1,1,5200,3 +900,permute_66,call_function,permute.default,forward,6,1,1,1,2,5199,3 +901,alias_default_176,call_function,alias.default,forward,6,1,1,6,682,5212,4 +902,alias_default_177,call_function,alias.default,forward,6,1,1,2,3,5198,3 +903,einsum_default_42,call_function,einsum.default,forward,6,2,2,1,687,5196,5 +904,dtype_cast_57,call_function,dtype_cast.default,forward,6,1,1,1,1,5200,3 +905,permute_67,call_function,permute.default,forward,6,1,1,1,2,5199,3 +906,alias_default_178,call_function,alias.default,forward,6,1,1,2,3,5198,3 +907,einsum_default_43,call_function,einsum.default,forward,6,2,2,1,687,5196,5 +908,dtype_cast_58,call_function,dtype_cast.default,forward,6,1,1,1,1,5193,3 +909,permute_68,call_function,permute.default,forward,6,1,1,1,2,5192,3 +910,alias_default_179,call_function,alias.default,forward,6,1,1,2,3,5191,3 +911,einsum_default_44,call_function,einsum.default,forward,6,2,2,1,687,5189,5 +912,view_144,call_function,view.default,forward,6,1,1,1,688,5195,4 +913,view_145,call_function,view.default,forward,6,1,1,1,688,5195,4 +914,view_146,call_function,view.default,forward,6,1,1,1,688,5188,4 +915,convert_element_type_152,call_function,convert_element_type.default,forward,6,1,1,1,689,5194,4 +916,view_147,call_function,view.default,forward,6,1,1,1,690,5193,4 +917,view_as_complex_12,call_function,view_as_complex.default,forward,6,1,1,1,691,5192,6 +918,convert_element_type_153,call_function,convert_element_type.default,forward,6,1,1,1,689,5194,4 +919,view_148,call_function,view.default,forward,6,1,1,1,690,5193,4 
+920,view_as_complex_13,call_function,view_as_complex.default,forward,6,1,1,1,691,5192,6 +921,view_149,call_function,view.default,forward,6,1,1,1,2,5203,3 +922,alias_default_180,call_function,alias.default,forward,6,1,1,4,3,5202,3 +923,mul_44,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8 +924,view_as_real_12,call_function,view_as_real.default,forward,6,1,1,1,695,5190,6 +925,view_150,call_function,view.default,forward,6,1,1,1,696,5189,6 +926,mul_45,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8 +927,view_as_real_13,call_function,view_as_real.default,forward,6,1,1,1,695,5190,6 +928,view_151,call_function,view.default,forward,6,1,1,1,696,5189,6 +929,convert_element_type_154,call_function,convert_element_type.default,forward,6,1,1,1,697,5188,6 +930,convert_element_type_155,call_function,convert_element_type.default,forward,6,1,1,1,697,5188,6 +931,permute_69,call_function,permute.default,forward,6,1,1,1,698,5187,6 +932,permute_70,call_function,permute.default,forward,6,1,1,1,698,5187,6 +933,permute_71,call_function,permute.default,forward,6,1,1,1,689,5187,4 +934,alias_default_181,call_function,alias.default,forward,6,1,1,2,699,5186,4 +935,alias_default_182,call_function,alias.default,forward,6,1,1,2,699,5186,4 +936,alias_default_183,call_function,alias.default,forward,6,1,1,2,690,5186,4 +937,_scaled_dot_product_flash_attention_6,call_function,_scaled_dot_product_flash_attention.default,forward,6,3,3,4,723,5185,2 +938,getitem_54,call_function,getitem,forward,6,1,1,1,724,5181,2 +939,getitem_55,call_function,getitem,forward,6,1,1,1,724,724,2 +940,getitem_60,call_function,getitem,forward,6,1,1,1,724,724,1 +941,getitem_61,call_function,getitem,forward,6,1,1,1,724,724,1 +942,alias_default_184,call_function,alias.default,forward,6,1,1,2,725,5180,4 +943,permute_72,call_function,permute.default,forward,6,1,1,1,726,5179,4 +944,view_152,call_function,view.default,forward,6,1,1,1,727,5178,3 +945,dtype_cast_59,call_function,dtype_cast.default,forward,6,1,1,1,1,5180,3 
+946,permute_73,call_function,permute.default,forward,6,1,1,1,2,5179,3 +947,alias_default_185,call_function,alias.default,forward,6,1,1,2,728,5177,4 +948,alias_default_186,call_function,alias.default,forward,6,1,1,2,3,5178,3 +949,einsum_default_45,call_function,einsum.default,forward,6,2,2,1,733,5176,5 +950,add_31,call_function,add.Tensor,forward,6,2,2,1,734,5175,10 +951,dtype_cast_60,call_function,dtype_cast.default,forward,6,1,1,1,1,5164,2 +952,alias_default_187,call_function,alias.default,forward,6,1,1,3,735,5174,4 +953,convert_element_type_158,call_function,convert_element_type.default,forward,6,1,1,1,736,5172,4 +954,alias_default_189,call_function,alias.default,forward,6,1,1,2,737,5171,4 +955,pow_14,call_function,pow.Tensor_Scalar,forward,6,1,1,1,738,5170,4 +956,mean_13,call_function,mean.dim,forward,6,1,1,1,739,5169,4 +957,add_32,call_function,add.Scalar,forward,6,1,1,1,740,5168,3 +958,rsqrt_13,call_function,rsqrt.default,forward,6,1,1,1,741,5167,3 +959,alias_default_190,call_function,alias.default,forward,6,1,1,3,742,5166,3 +960,mul_46,call_function,mul.Tensor,forward,6,2,2,1,743,5162,8 +961,alias_default_188,call_function,alias.default,forward,6,1,1,2,2,5163,2 +962,mul_47,call_function,mul.Tensor,forward,6,2,2,1,747,5161,8 +963,convert_element_type_159,call_function,convert_element_type.default,forward,6,1,1,1,748,5160,6 +964,dtype_cast_61,call_function,dtype_cast.default,forward,6,1,1,1,1,5160,3 +965,permute_74,call_function,permute.default,forward,6,1,1,1,2,5159,3 +966,alias_default_191,call_function,alias.default,forward,6,1,1,4,749,5159,4 +967,alias_default_192,call_function,alias.default,forward,6,1,1,2,3,5158,3 +968,einsum_default_46,call_function,einsum.default,forward,6,2,2,1,754,5156,5 +969,alias_default_193,call_function,alias.default,forward,6,1,1,2,755,5155,4 +970,convert_element_type_162,call_function,convert_element_type.default,forward,6,1,1,1,756,5143,4 +971,alias_default_194,call_function,alias.default,forward,6,1,1,2,757,5142,4 
+972,neg_6,call_function,neg.default,forward,6,1,1,1,758,5141,8 +973,exp_6,call_function,exp.default,forward,6,1,1,1,759,5140,6 +974,add_33,call_function,add.Tensor,forward,6,1,1,1,760,5139,4 +975,div_6,call_function,div.Tensor,forward,6,2,2,1,761,5138,6 +976,convert_element_type_163,call_function,convert_element_type.default,forward,6,1,1,1,762,5137,6 +977,dtype_cast_62,call_function,dtype_cast.default,forward,6,1,1,1,1,5141,3 +978,permute_75,call_function,permute.default,forward,6,1,1,1,2,5140,3 +979,alias_default_196,call_function,alias.default,forward,6,1,1,2,3,5139,3 +980,einsum_default_47,call_function,einsum.default,forward,6,2,2,1,754,5137,5 +981,alias_default_195,call_function,alias.default,forward,6,1,1,2,763,5136,4 +982,alias_default_197,call_function,alias.default,forward,6,1,1,2,755,5136,4 +983,mul_48,call_function,mul.Tensor,forward,6,2,2,1,770,5135,8 +984,dtype_cast_63,call_function,dtype_cast.default,forward,6,1,1,1,1,5137,3 +985,permute_76,call_function,permute.default,forward,6,1,1,1,2,5136,3 +986,alias_default_198,call_function,alias.default,forward,6,1,1,2,771,5134,4 +987,alias_default_199,call_function,alias.default,forward,6,1,1,2,3,5135,3 +988,einsum_default_48,call_function,einsum.default,forward,6,2,2,1,776,5133,5 +989,add_34,call_function,add.Tensor,forward,6,2,2,1,777,5132,10 +990,dtype_cast_64,call_function,dtype_cast.default,forward,7,1,1,1,1,5121,2 +991,alias_default_200,call_function,alias.default,forward,6,1,1,3,778,5131,4 +992,convert_element_type_168,call_function,convert_element_type.default,forward,7,1,1,1,779,5129,4 +993,alias_default_202,call_function,alias.default,forward,7,1,1,2,780,5128,4 +994,pow_15,call_function,pow.Tensor_Scalar,forward,7,1,1,1,781,5127,4 +995,mean_14,call_function,mean.dim,forward,7,1,1,1,782,5126,4 +996,add_35,call_function,add.Scalar,forward,7,1,1,1,783,5125,3 +997,rsqrt_14,call_function,rsqrt.default,forward,7,1,1,1,784,5124,3 
+998,alias_default_203,call_function,alias.default,forward,7,1,1,3,785,5123,3 +999,mul_49,call_function,mul.Tensor,forward,7,2,2,1,786,5119,8 +1000,alias_default_201,call_function,alias.default,forward,7,1,1,2,2,5120,2 +1001,mul_50,call_function,mul.Tensor,forward,7,2,2,1,790,5118,8 +1002,convert_element_type_169,call_function,convert_element_type.default,forward,7,1,1,1,791,5117,6 +1003,dtype_cast_65,call_function,dtype_cast.default,forward,7,1,1,1,1,5104,3 +1004,permute_77,call_function,permute.default,forward,7,1,1,1,2,5103,3 +1005,alias_default_204,call_function,alias.default,forward,7,1,1,6,792,5116,4 +1006,alias_default_205,call_function,alias.default,forward,7,1,1,2,3,5102,3 +1007,einsum_default_49,call_function,einsum.default,forward,7,2,2,1,797,5100,5 +1008,dtype_cast_66,call_function,dtype_cast.default,forward,7,1,1,1,1,5104,3 +1009,permute_78,call_function,permute.default,forward,7,1,1,1,2,5103,3 +1010,alias_default_206,call_function,alias.default,forward,7,1,1,2,3,5102,3 +1011,einsum_default_50,call_function,einsum.default,forward,7,2,2,1,797,5100,5 +1012,dtype_cast_67,call_function,dtype_cast.default,forward,7,1,1,1,1,5097,3 +1013,permute_79,call_function,permute.default,forward,7,1,1,1,2,5096,3 +1014,alias_default_207,call_function,alias.default,forward,7,1,1,2,3,5095,3 +1015,einsum_default_51,call_function,einsum.default,forward,7,2,2,1,797,5093,5 +1016,view_167,call_function,view.default,forward,7,1,1,1,798,5099,4 +1017,view_168,call_function,view.default,forward,7,1,1,1,798,5099,4 +1018,view_169,call_function,view.default,forward,7,1,1,1,798,5092,4 +1019,convert_element_type_176,call_function,convert_element_type.default,forward,7,1,1,1,799,5098,4 +1020,view_170,call_function,view.default,forward,7,1,1,1,800,5097,4 +1021,view_as_complex_14,call_function,view_as_complex.default,forward,7,1,1,1,801,5096,6 +1022,convert_element_type_177,call_function,convert_element_type.default,forward,7,1,1,1,799,5098,4 
+1023,view_171,call_function,view.default,forward,7,1,1,1,800,5097,4 +1024,view_as_complex_15,call_function,view_as_complex.default,forward,7,1,1,1,801,5096,6 +1025,view_172,call_function,view.default,forward,7,1,1,1,2,5107,3 +1026,alias_default_208,call_function,alias.default,forward,7,1,1,4,3,5106,3 +1027,mul_51,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8 +1028,view_as_real_14,call_function,view_as_real.default,forward,7,1,1,1,805,5094,6 +1029,view_173,call_function,view.default,forward,7,1,1,1,806,5093,6 +1030,mul_52,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8 +1031,view_as_real_15,call_function,view_as_real.default,forward,7,1,1,1,805,5094,6 +1032,view_174,call_function,view.default,forward,7,1,1,1,806,5093,6 +1033,convert_element_type_178,call_function,convert_element_type.default,forward,7,1,1,1,807,5092,6 +1034,convert_element_type_179,call_function,convert_element_type.default,forward,7,1,1,1,807,5092,6 +1035,permute_80,call_function,permute.default,forward,7,1,1,1,808,5091,6 +1036,permute_81,call_function,permute.default,forward,7,1,1,1,808,5091,6 +1037,permute_82,call_function,permute.default,forward,7,1,1,1,799,5091,4 +1038,alias_default_209,call_function,alias.default,forward,7,1,1,2,809,5090,4 +1039,alias_default_210,call_function,alias.default,forward,7,1,1,2,809,5090,4 +1040,alias_default_211,call_function,alias.default,forward,7,1,1,2,800,5090,4 +1041,_scaled_dot_product_flash_attention_7,call_function,_scaled_dot_product_flash_attention.default,forward,7,3,3,4,833,5089,2 +1042,getitem_63,call_function,getitem,forward,7,1,1,1,834,5085,2 +1043,getitem_64,call_function,getitem,forward,7,1,1,1,834,834,2 +1044,getitem_69,call_function,getitem,forward,7,1,1,1,834,834,1 +1045,getitem_70,call_function,getitem,forward,7,1,1,1,834,834,1 +1046,alias_default_212,call_function,alias.default,forward,7,1,1,2,835,5084,4 +1047,permute_83,call_function,permute.default,forward,7,1,1,1,836,5083,4 
+1048,view_175,call_function,view.default,forward,7,1,1,1,837,5082,3 +1049,dtype_cast_68,call_function,dtype_cast.default,forward,7,1,1,1,1,5084,3 +1050,permute_84,call_function,permute.default,forward,7,1,1,1,2,5083,3 +1051,alias_default_213,call_function,alias.default,forward,7,1,1,2,838,5081,4 +1052,alias_default_214,call_function,alias.default,forward,7,1,1,2,3,5082,3 +1053,einsum_default_52,call_function,einsum.default,forward,7,2,2,1,843,5080,5 +1054,add_36,call_function,add.Tensor,forward,7,2,2,1,844,5079,10 +1055,dtype_cast_69,call_function,dtype_cast.default,forward,7,1,1,1,1,5068,2 +1056,alias_default_215,call_function,alias.default,forward,7,1,1,3,845,5078,4 +1057,convert_element_type_182,call_function,convert_element_type.default,forward,7,1,1,1,846,5076,4 +1058,alias_default_217,call_function,alias.default,forward,7,1,1,2,847,5075,4 +1059,pow_16,call_function,pow.Tensor_Scalar,forward,7,1,1,1,848,5074,4 +1060,mean_15,call_function,mean.dim,forward,7,1,1,1,849,5073,4 +1061,add_37,call_function,add.Scalar,forward,7,1,1,1,850,5072,3 +1062,rsqrt_15,call_function,rsqrt.default,forward,7,1,1,1,851,5071,3 +1063,alias_default_218,call_function,alias.default,forward,7,1,1,3,852,5070,3 +1064,mul_53,call_function,mul.Tensor,forward,7,2,2,1,853,5066,8 +1065,alias_default_216,call_function,alias.default,forward,7,1,1,2,2,5067,2 +1066,mul_54,call_function,mul.Tensor,forward,7,2,2,1,857,5065,8 +1067,convert_element_type_183,call_function,convert_element_type.default,forward,7,1,1,1,858,5064,6 +1068,dtype_cast_70,call_function,dtype_cast.default,forward,7,1,1,1,1,5064,3 +1069,permute_85,call_function,permute.default,forward,7,1,1,1,2,5063,3 +1070,alias_default_219,call_function,alias.default,forward,7,1,1,4,859,5063,4 +1071,alias_default_220,call_function,alias.default,forward,7,1,1,2,3,5062,3 +1072,einsum_default_53,call_function,einsum.default,forward,7,2,2,1,864,5060,5 +1073,alias_default_221,call_function,alias.default,forward,7,1,1,2,865,5059,4 
+1074,convert_element_type_186,call_function,convert_element_type.default,forward,7,1,1,1,866,5047,4 +1075,alias_default_222,call_function,alias.default,forward,7,1,1,2,867,5046,4 +1076,neg_7,call_function,neg.default,forward,7,1,1,1,868,5045,8 +1077,exp_7,call_function,exp.default,forward,7,1,1,1,869,5044,6 +1078,add_38,call_function,add.Tensor,forward,7,1,1,1,870,5043,4 +1079,div_7,call_function,div.Tensor,forward,7,2,2,1,871,5042,6 +1080,convert_element_type_187,call_function,convert_element_type.default,forward,7,1,1,1,872,5041,6 +1081,dtype_cast_71,call_function,dtype_cast.default,forward,7,1,1,1,1,5045,3 +1082,permute_86,call_function,permute.default,forward,7,1,1,1,2,5044,3 +1083,alias_default_224,call_function,alias.default,forward,7,1,1,2,3,5043,3 +1084,einsum_default_54,call_function,einsum.default,forward,7,2,2,1,864,5041,5 +1085,alias_default_223,call_function,alias.default,forward,7,1,1,2,873,5040,4 +1086,alias_default_225,call_function,alias.default,forward,7,1,1,2,865,5040,4 +1087,mul_55,call_function,mul.Tensor,forward,7,2,2,1,880,5039,8 +1088,dtype_cast_72,call_function,dtype_cast.default,forward,7,1,1,1,1,5041,3 +1089,permute_87,call_function,permute.default,forward,7,1,1,1,2,5040,3 +1090,alias_default_226,call_function,alias.default,forward,7,1,1,2,881,5038,4 +1091,alias_default_227,call_function,alias.default,forward,7,1,1,2,3,5039,3 +1092,einsum_default_55,call_function,einsum.default,forward,7,2,2,1,886,5037,5 +1093,add_39,call_function,add.Tensor,forward,7,2,2,1,887,5036,10 +1094,dtype_cast_73,call_function,dtype_cast.default,forward,8,1,1,1,1,5025,2 +1095,alias_default_228,call_function,alias.default,forward,7,1,1,3,888,5035,4 +1096,convert_element_type_192,call_function,convert_element_type.default,forward,8,1,1,1,889,5033,4 +1097,alias_default_230,call_function,alias.default,forward,8,1,1,2,890,5032,4 +1098,pow_17,call_function,pow.Tensor_Scalar,forward,8,1,1,1,891,5031,4 +1099,mean_16,call_function,mean.dim,forward,8,1,1,1,892,5030,4 
+1100,add_40,call_function,add.Scalar,forward,8,1,1,1,893,5029,3 +1101,rsqrt_16,call_function,rsqrt.default,forward,8,1,1,1,894,5028,3 +1102,alias_default_231,call_function,alias.default,forward,8,1,1,3,895,5027,3 +1103,mul_56,call_function,mul.Tensor,forward,8,2,2,1,896,5023,8 +1104,alias_default_229,call_function,alias.default,forward,8,1,1,2,2,5024,2 +1105,mul_57,call_function,mul.Tensor,forward,8,2,2,1,900,5022,8 +1106,convert_element_type_193,call_function,convert_element_type.default,forward,8,1,1,1,901,5021,6 +1107,dtype_cast_74,call_function,dtype_cast.default,forward,8,1,1,1,1,5008,3 +1108,permute_88,call_function,permute.default,forward,8,1,1,1,2,5007,3 +1109,alias_default_232,call_function,alias.default,forward,8,1,1,6,902,5020,4 +1110,alias_default_233,call_function,alias.default,forward,8,1,1,2,3,5006,3 +1111,einsum_default_56,call_function,einsum.default,forward,8,2,2,1,907,5004,5 +1112,dtype_cast_75,call_function,dtype_cast.default,forward,8,1,1,1,1,5008,3 +1113,permute_89,call_function,permute.default,forward,8,1,1,1,2,5007,3 +1114,alias_default_234,call_function,alias.default,forward,8,1,1,2,3,5006,3 +1115,einsum_default_57,call_function,einsum.default,forward,8,2,2,1,907,5004,5 +1116,dtype_cast_76,call_function,dtype_cast.default,forward,8,1,1,1,1,5001,3 +1117,permute_90,call_function,permute.default,forward,8,1,1,1,2,5000,3 +1118,alias_default_235,call_function,alias.default,forward,8,1,1,2,3,4999,3 +1119,einsum_default_58,call_function,einsum.default,forward,8,2,2,1,907,4997,5 +1120,view_190,call_function,view.default,forward,8,1,1,1,908,5003,4 +1121,view_191,call_function,view.default,forward,8,1,1,1,908,5003,4 +1122,view_192,call_function,view.default,forward,8,1,1,1,908,4996,4 +1123,convert_element_type_200,call_function,convert_element_type.default,forward,8,1,1,1,909,5002,4 +1124,view_193,call_function,view.default,forward,8,1,1,1,910,5001,4 +1125,view_as_complex_16,call_function,view_as_complex.default,forward,8,1,1,1,911,5000,6 
+1126,convert_element_type_201,call_function,convert_element_type.default,forward,8,1,1,1,909,5002,4 +1127,view_194,call_function,view.default,forward,8,1,1,1,910,5001,4 +1128,view_as_complex_17,call_function,view_as_complex.default,forward,8,1,1,1,911,5000,6 +1129,view_195,call_function,view.default,forward,8,1,1,1,2,5011,3 +1130,alias_default_236,call_function,alias.default,forward,8,1,1,4,3,5010,3 +1131,mul_58,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8 +1132,view_as_real_16,call_function,view_as_real.default,forward,8,1,1,1,915,4998,6 +1133,view_196,call_function,view.default,forward,8,1,1,1,916,4997,6 +1134,mul_59,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8 +1135,view_as_real_17,call_function,view_as_real.default,forward,8,1,1,1,915,4998,6 +1136,view_197,call_function,view.default,forward,8,1,1,1,916,4997,6 +1137,convert_element_type_202,call_function,convert_element_type.default,forward,8,1,1,1,917,4996,6 +1138,convert_element_type_203,call_function,convert_element_type.default,forward,8,1,1,1,917,4996,6 +1139,permute_91,call_function,permute.default,forward,8,1,1,1,918,4995,6 +1140,permute_92,call_function,permute.default,forward,8,1,1,1,918,4995,6 +1141,permute_93,call_function,permute.default,forward,8,1,1,1,909,4995,4 +1142,alias_default_237,call_function,alias.default,forward,8,1,1,2,919,4994,4 +1143,alias_default_238,call_function,alias.default,forward,8,1,1,2,919,4994,4 +1144,alias_default_239,call_function,alias.default,forward,8,1,1,2,910,4994,4 +1145,_scaled_dot_product_flash_attention_8,call_function,_scaled_dot_product_flash_attention.default,forward,8,3,3,4,943,4993,2 +1146,getitem_72,call_function,getitem,forward,8,1,1,1,944,4989,2 +1147,getitem_73,call_function,getitem,forward,8,1,1,1,944,944,2 +1148,getitem_78,call_function,getitem,forward,8,1,1,1,944,944,1 +1149,getitem_79,call_function,getitem,forward,8,1,1,1,944,944,1 +1150,alias_default_240,call_function,alias.default,forward,8,1,1,2,945,4988,4 
+1151,permute_94,call_function,permute.default,forward,8,1,1,1,946,4987,4 +1152,view_198,call_function,view.default,forward,8,1,1,1,947,4986,3 +1153,dtype_cast_77,call_function,dtype_cast.default,forward,8,1,1,1,1,4988,3 +1154,permute_95,call_function,permute.default,forward,8,1,1,1,2,4987,3 +1155,alias_default_241,call_function,alias.default,forward,8,1,1,2,948,4985,4 +1156,alias_default_242,call_function,alias.default,forward,8,1,1,2,3,4986,3 +1157,einsum_default_59,call_function,einsum.default,forward,8,2,2,1,953,4984,5 +1158,add_41,call_function,add.Tensor,forward,8,2,2,1,954,4983,10 +1159,dtype_cast_78,call_function,dtype_cast.default,forward,8,1,1,1,1,4972,2 +1160,alias_default_243,call_function,alias.default,forward,8,1,1,3,955,4982,4 +1161,convert_element_type_206,call_function,convert_element_type.default,forward,8,1,1,1,956,4980,4 +1162,alias_default_245,call_function,alias.default,forward,8,1,1,2,957,4979,4 +1163,pow_18,call_function,pow.Tensor_Scalar,forward,8,1,1,1,958,4978,4 +1164,mean_17,call_function,mean.dim,forward,8,1,1,1,959,4977,4 +1165,add_42,call_function,add.Scalar,forward,8,1,1,1,960,4976,3 +1166,rsqrt_17,call_function,rsqrt.default,forward,8,1,1,1,961,4975,3 +1167,alias_default_246,call_function,alias.default,forward,8,1,1,3,962,4974,3 +1168,mul_60,call_function,mul.Tensor,forward,8,2,2,1,963,4970,8 +1169,alias_default_244,call_function,alias.default,forward,8,1,1,2,2,4971,2 +1170,mul_61,call_function,mul.Tensor,forward,8,2,2,1,967,4969,8 +1171,convert_element_type_207,call_function,convert_element_type.default,forward,8,1,1,1,968,4968,6 +1172,dtype_cast_79,call_function,dtype_cast.default,forward,8,1,1,1,1,4968,3 +1173,permute_96,call_function,permute.default,forward,8,1,1,1,2,4967,3 +1174,alias_default_247,call_function,alias.default,forward,8,1,1,4,969,4967,4 +1175,alias_default_248,call_function,alias.default,forward,8,1,1,2,3,4966,3 +1176,einsum_default_60,call_function,einsum.default,forward,8,2,2,1,974,4964,5 
+1177,alias_default_249,call_function,alias.default,forward,8,1,1,2,975,4963,4 +1178,convert_element_type_210,call_function,convert_element_type.default,forward,8,1,1,1,976,4951,4 +1179,alias_default_250,call_function,alias.default,forward,8,1,1,2,977,4950,4 +1180,neg_8,call_function,neg.default,forward,8,1,1,1,978,4949,8 +1181,exp_8,call_function,exp.default,forward,8,1,1,1,979,4948,6 +1182,add_43,call_function,add.Tensor,forward,8,1,1,1,980,4947,4 +1183,div_8,call_function,div.Tensor,forward,8,2,2,1,981,4946,6 +1184,convert_element_type_211,call_function,convert_element_type.default,forward,8,1,1,1,982,4945,6 +1185,dtype_cast_80,call_function,dtype_cast.default,forward,8,1,1,1,1,4949,3 +1186,permute_97,call_function,permute.default,forward,8,1,1,1,2,4948,3 +1187,alias_default_252,call_function,alias.default,forward,8,1,1,2,3,4947,3 +1188,einsum_default_61,call_function,einsum.default,forward,8,2,2,1,974,4945,5 +1189,alias_default_251,call_function,alias.default,forward,8,1,1,2,983,4944,4 +1190,alias_default_253,call_function,alias.default,forward,8,1,1,2,975,4944,4 +1191,mul_62,call_function,mul.Tensor,forward,8,2,2,1,990,4943,8 +1192,dtype_cast_81,call_function,dtype_cast.default,forward,8,1,1,1,1,4945,3 +1193,permute_98,call_function,permute.default,forward,8,1,1,1,2,4944,3 +1194,alias_default_254,call_function,alias.default,forward,8,1,1,2,991,4942,4 +1195,alias_default_255,call_function,alias.default,forward,8,1,1,2,3,4943,3 +1196,einsum_default_62,call_function,einsum.default,forward,8,2,2,1,996,4941,5 +1197,add_44,call_function,add.Tensor,forward,8,2,2,1,997,4940,10 +1198,dtype_cast_82,call_function,dtype_cast.default,forward,9,1,1,1,1,4929,2 +1199,alias_default_256,call_function,alias.default,forward,8,1,1,3,998,4939,4 +1200,convert_element_type_216,call_function,convert_element_type.default,forward,9,1,1,1,999,4937,4 +1201,alias_default_258,call_function,alias.default,forward,9,1,1,2,1000,4936,4 
+1202,pow_19,call_function,pow.Tensor_Scalar,forward,9,1,1,1,1001,4935,4 +1203,mean_18,call_function,mean.dim,forward,9,1,1,1,1002,4934,4 +1204,add_45,call_function,add.Scalar,forward,9,1,1,1,1003,4933,3 +1205,rsqrt_18,call_function,rsqrt.default,forward,9,1,1,1,1004,4932,3 +1206,alias_default_259,call_function,alias.default,forward,9,1,1,3,1005,4931,3 +1207,mul_63,call_function,mul.Tensor,forward,9,2,2,1,1006,4927,8 +1208,alias_default_257,call_function,alias.default,forward,9,1,1,2,2,4928,2 +1209,mul_64,call_function,mul.Tensor,forward,9,2,2,1,1010,4926,8 +1210,convert_element_type_217,call_function,convert_element_type.default,forward,9,1,1,1,1011,4925,6 +1211,dtype_cast_83,call_function,dtype_cast.default,forward,9,1,1,1,1,4912,3 +1212,permute_99,call_function,permute.default,forward,9,1,1,1,2,4911,3 +1213,alias_default_260,call_function,alias.default,forward,9,1,1,6,1012,4924,4 +1214,alias_default_261,call_function,alias.default,forward,9,1,1,2,3,4910,3 +1215,einsum_default_63,call_function,einsum.default,forward,9,2,2,1,1017,4908,5 +1216,dtype_cast_84,call_function,dtype_cast.default,forward,9,1,1,1,1,4912,3 +1217,permute_100,call_function,permute.default,forward,9,1,1,1,2,4911,3 +1218,alias_default_262,call_function,alias.default,forward,9,1,1,2,3,4910,3 +1219,einsum_default_64,call_function,einsum.default,forward,9,2,2,1,1017,4908,5 +1220,dtype_cast_85,call_function,dtype_cast.default,forward,9,1,1,1,1,4905,3 +1221,permute_101,call_function,permute.default,forward,9,1,1,1,2,4904,3 +1222,alias_default_263,call_function,alias.default,forward,9,1,1,2,3,4903,3 +1223,einsum_default_65,call_function,einsum.default,forward,9,2,2,1,1017,4901,5 +1224,view_213,call_function,view.default,forward,9,1,1,1,1018,4907,4 +1225,view_214,call_function,view.default,forward,9,1,1,1,1018,4907,4 +1226,view_215,call_function,view.default,forward,9,1,1,1,1018,4900,4 +1227,convert_element_type_224,call_function,convert_element_type.default,forward,9,1,1,1,1019,4906,4 
+1228,view_216,call_function,view.default,forward,9,1,1,1,1020,4905,4 +1229,view_as_complex_18,call_function,view_as_complex.default,forward,9,1,1,1,1021,4904,6 +1230,convert_element_type_225,call_function,convert_element_type.default,forward,9,1,1,1,1019,4906,4 +1231,view_217,call_function,view.default,forward,9,1,1,1,1020,4905,4 +1232,view_as_complex_19,call_function,view_as_complex.default,forward,9,1,1,1,1021,4904,6 +1233,view_218,call_function,view.default,forward,9,1,1,1,2,4915,3 +1234,alias_default_264,call_function,alias.default,forward,9,1,1,4,3,4914,3 +1235,mul_65,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8 +1236,view_as_real_18,call_function,view_as_real.default,forward,9,1,1,1,1025,4902,6 +1237,view_219,call_function,view.default,forward,9,1,1,1,1026,4901,6 +1238,mul_66,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8 +1239,view_as_real_19,call_function,view_as_real.default,forward,9,1,1,1,1025,4902,6 +1240,view_220,call_function,view.default,forward,9,1,1,1,1026,4901,6 +1241,convert_element_type_226,call_function,convert_element_type.default,forward,9,1,1,1,1027,4900,6 +1242,convert_element_type_227,call_function,convert_element_type.default,forward,9,1,1,1,1027,4900,6 +1243,permute_102,call_function,permute.default,forward,9,1,1,1,1028,4899,6 +1244,permute_103,call_function,permute.default,forward,9,1,1,1,1028,4899,6 +1245,permute_104,call_function,permute.default,forward,9,1,1,1,1019,4899,4 +1246,alias_default_265,call_function,alias.default,forward,9,1,1,2,1029,4898,4 +1247,alias_default_266,call_function,alias.default,forward,9,1,1,2,1029,4898,4 +1248,alias_default_267,call_function,alias.default,forward,9,1,1,2,1020,4898,4 +1249,_scaled_dot_product_flash_attention_9,call_function,_scaled_dot_product_flash_attention.default,forward,9,3,3,4,1053,4897,2 +1250,getitem_81,call_function,getitem,forward,9,1,1,1,1054,4893,2 +1251,getitem_82,call_function,getitem,forward,9,1,1,1,1054,1054,2 
+1252,getitem_87,call_function,getitem,forward,9,1,1,1,1054,1054,1 +1253,getitem_88,call_function,getitem,forward,9,1,1,1,1054,1054,1 +1254,alias_default_268,call_function,alias.default,forward,9,1,1,2,1055,4892,4 +1255,permute_105,call_function,permute.default,forward,9,1,1,1,1056,4891,4 +1256,view_221,call_function,view.default,forward,9,1,1,1,1057,4890,3 +1257,dtype_cast_86,call_function,dtype_cast.default,forward,9,1,1,1,1,4892,3 +1258,permute_106,call_function,permute.default,forward,9,1,1,1,2,4891,3 +1259,alias_default_269,call_function,alias.default,forward,9,1,1,2,1058,4889,4 +1260,alias_default_270,call_function,alias.default,forward,9,1,1,2,3,4890,3 +1261,einsum_default_66,call_function,einsum.default,forward,9,2,2,1,1063,4888,5 +1262,add_46,call_function,add.Tensor,forward,9,2,2,1,1064,4887,10 +1263,dtype_cast_87,call_function,dtype_cast.default,forward,9,1,1,1,1,4876,2 +1264,alias_default_271,call_function,alias.default,forward,9,1,1,3,1065,4886,4 +1265,convert_element_type_230,call_function,convert_element_type.default,forward,9,1,1,1,1066,4884,4 +1266,alias_default_273,call_function,alias.default,forward,9,1,1,2,1067,4883,4 +1267,pow_20,call_function,pow.Tensor_Scalar,forward,9,1,1,1,1068,4882,4 +1268,mean_19,call_function,mean.dim,forward,9,1,1,1,1069,4881,4 +1269,add_47,call_function,add.Scalar,forward,9,1,1,1,1070,4880,3 +1270,rsqrt_19,call_function,rsqrt.default,forward,9,1,1,1,1071,4879,3 +1271,alias_default_274,call_function,alias.default,forward,9,1,1,3,1072,4878,3 +1272,mul_67,call_function,mul.Tensor,forward,9,2,2,1,1073,4874,8 +1273,alias_default_272,call_function,alias.default,forward,9,1,1,2,2,4875,2 +1274,mul_68,call_function,mul.Tensor,forward,9,2,2,1,1077,4873,8 +1275,convert_element_type_231,call_function,convert_element_type.default,forward,9,1,1,1,1078,4872,6 +1276,dtype_cast_88,call_function,dtype_cast.default,forward,9,1,1,1,1,4872,3 +1277,permute_107,call_function,permute.default,forward,9,1,1,1,2,4871,3 
+1278,alias_default_275,call_function,alias.default,forward,9,1,1,4,1079,4871,4 +1279,alias_default_276,call_function,alias.default,forward,9,1,1,2,3,4870,3 +1280,einsum_default_67,call_function,einsum.default,forward,9,2,2,1,1084,4868,5 +1281,alias_default_277,call_function,alias.default,forward,9,1,1,2,1085,4867,4 +1282,convert_element_type_234,call_function,convert_element_type.default,forward,9,1,1,1,1086,4855,4 +1283,alias_default_278,call_function,alias.default,forward,9,1,1,2,1087,4854,4 +1284,neg_9,call_function,neg.default,forward,9,1,1,1,1088,4853,8 +1285,exp_9,call_function,exp.default,forward,9,1,1,1,1089,4852,6 +1286,add_48,call_function,add.Tensor,forward,9,1,1,1,1090,4851,4 +1287,div_9,call_function,div.Tensor,forward,9,2,2,1,1091,4850,6 +1288,convert_element_type_235,call_function,convert_element_type.default,forward,9,1,1,1,1092,4849,6 +1289,dtype_cast_89,call_function,dtype_cast.default,forward,9,1,1,1,1,4853,3 +1290,permute_108,call_function,permute.default,forward,9,1,1,1,2,4852,3 +1291,alias_default_280,call_function,alias.default,forward,9,1,1,2,3,4851,3 +1292,einsum_default_68,call_function,einsum.default,forward,9,2,2,1,1084,4849,5 +1293,alias_default_279,call_function,alias.default,forward,9,1,1,2,1093,4848,4 +1294,alias_default_281,call_function,alias.default,forward,9,1,1,2,1085,4848,4 +1295,mul_69,call_function,mul.Tensor,forward,9,2,2,1,1100,4847,8 +1296,dtype_cast_90,call_function,dtype_cast.default,forward,9,1,1,1,1,4849,3 +1297,permute_109,call_function,permute.default,forward,9,1,1,1,2,4848,3 +1298,alias_default_282,call_function,alias.default,forward,9,1,1,2,1101,4846,4 +1299,alias_default_283,call_function,alias.default,forward,9,1,1,2,3,4847,3 +1300,einsum_default_69,call_function,einsum.default,forward,9,2,2,1,1106,4845,5 +1301,add_49,call_function,add.Tensor,forward,9,2,2,1,1107,4844,10 +1302,dtype_cast_91,call_function,dtype_cast.default,forward,10,1,1,1,1,4833,2 
+1303,alias_default_284,call_function,alias.default,forward,9,1,1,3,1108,4843,4 +1304,convert_element_type_240,call_function,convert_element_type.default,forward,10,1,1,1,1109,4841,4 +1305,alias_default_286,call_function,alias.default,forward,10,1,1,2,1110,4840,4 +1306,pow_21,call_function,pow.Tensor_Scalar,forward,10,1,1,1,1111,4839,4 +1307,mean_20,call_function,mean.dim,forward,10,1,1,1,1112,4838,4 +1308,add_50,call_function,add.Scalar,forward,10,1,1,1,1113,4837,3 +1309,rsqrt_20,call_function,rsqrt.default,forward,10,1,1,1,1114,4836,3 +1310,alias_default_287,call_function,alias.default,forward,10,1,1,3,1115,4835,3 +1311,mul_70,call_function,mul.Tensor,forward,10,2,2,1,1116,4831,8 +1312,alias_default_285,call_function,alias.default,forward,10,1,1,2,2,4832,2 +1313,mul_71,call_function,mul.Tensor,forward,10,2,2,1,1120,4830,8 +1314,convert_element_type_241,call_function,convert_element_type.default,forward,10,1,1,1,1121,4829,6 +1315,dtype_cast_92,call_function,dtype_cast.default,forward,10,1,1,1,1,4816,3 +1316,permute_110,call_function,permute.default,forward,10,1,1,1,2,4815,3 +1317,alias_default_288,call_function,alias.default,forward,10,1,1,6,1122,4828,4 +1318,alias_default_289,call_function,alias.default,forward,10,1,1,2,3,4814,3 +1319,einsum_default_70,call_function,einsum.default,forward,10,2,2,1,1127,4812,5 +1320,dtype_cast_93,call_function,dtype_cast.default,forward,10,1,1,1,1,4816,3 +1321,permute_111,call_function,permute.default,forward,10,1,1,1,2,4815,3 +1322,alias_default_290,call_function,alias.default,forward,10,1,1,2,3,4814,3 +1323,einsum_default_71,call_function,einsum.default,forward,10,2,2,1,1127,4812,5 +1324,dtype_cast_94,call_function,dtype_cast.default,forward,10,1,1,1,1,4809,3 +1325,permute_112,call_function,permute.default,forward,10,1,1,1,2,4808,3 +1326,alias_default_291,call_function,alias.default,forward,10,1,1,2,3,4807,3 +1327,einsum_default_72,call_function,einsum.default,forward,10,2,2,1,1127,4805,5 
+1328,view_236,call_function,view.default,forward,10,1,1,1,1128,4811,4 +1329,view_237,call_function,view.default,forward,10,1,1,1,1128,4811,4 +1330,view_238,call_function,view.default,forward,10,1,1,1,1128,4804,4 +1331,convert_element_type_248,call_function,convert_element_type.default,forward,10,1,1,1,1129,4810,4 +1332,view_239,call_function,view.default,forward,10,1,1,1,1130,4809,4 +1333,view_as_complex_20,call_function,view_as_complex.default,forward,10,1,1,1,1131,4808,6 +1334,convert_element_type_249,call_function,convert_element_type.default,forward,10,1,1,1,1129,4810,4 +1335,view_240,call_function,view.default,forward,10,1,1,1,1130,4809,4 +1336,view_as_complex_21,call_function,view_as_complex.default,forward,10,1,1,1,1131,4808,6 +1337,view_241,call_function,view.default,forward,10,1,1,1,2,4819,3 +1338,alias_default_292,call_function,alias.default,forward,10,1,1,4,3,4818,3 +1339,mul_72,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8 +1340,view_as_real_20,call_function,view_as_real.default,forward,10,1,1,1,1135,4806,6 +1341,view_242,call_function,view.default,forward,10,1,1,1,1136,4805,6 +1342,mul_73,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8 +1343,view_as_real_21,call_function,view_as_real.default,forward,10,1,1,1,1135,4806,6 +1344,view_243,call_function,view.default,forward,10,1,1,1,1136,4805,6 +1345,convert_element_type_250,call_function,convert_element_type.default,forward,10,1,1,1,1137,4804,6 +1346,convert_element_type_251,call_function,convert_element_type.default,forward,10,1,1,1,1137,4804,6 +1347,permute_113,call_function,permute.default,forward,10,1,1,1,1138,4803,6 +1348,permute_114,call_function,permute.default,forward,10,1,1,1,1138,4803,6 +1349,permute_115,call_function,permute.default,forward,10,1,1,1,1129,4803,4 +1350,alias_default_293,call_function,alias.default,forward,10,1,1,2,1139,4802,4 +1351,alias_default_294,call_function,alias.default,forward,10,1,1,2,1139,4802,4 
+1352,alias_default_295,call_function,alias.default,forward,10,1,1,2,1130,4802,4 +1353,_scaled_dot_product_flash_attention_10,call_function,_scaled_dot_product_flash_attention.default,forward,10,3,3,4,1163,4801,2 +1354,getitem_90,call_function,getitem,forward,10,1,1,1,1164,4797,2 +1355,getitem_91,call_function,getitem,forward,10,1,1,1,1164,1164,2 +1356,getitem_96,call_function,getitem,forward,10,1,1,1,1164,1164,1 +1357,getitem_97,call_function,getitem,forward,10,1,1,1,1164,1164,1 +1358,alias_default_296,call_function,alias.default,forward,10,1,1,2,1165,4796,4 +1359,permute_116,call_function,permute.default,forward,10,1,1,1,1166,4795,4 +1360,view_244,call_function,view.default,forward,10,1,1,1,1167,4794,3 +1361,dtype_cast_95,call_function,dtype_cast.default,forward,10,1,1,1,1,4796,3 +1362,permute_117,call_function,permute.default,forward,10,1,1,1,2,4795,3 +1363,alias_default_297,call_function,alias.default,forward,10,1,1,2,1168,4793,4 +1364,alias_default_298,call_function,alias.default,forward,10,1,1,2,3,4794,3 +1365,einsum_default_73,call_function,einsum.default,forward,10,2,2,1,1173,4792,5 +1366,add_51,call_function,add.Tensor,forward,10,2,2,1,1174,4791,10 +1367,dtype_cast_96,call_function,dtype_cast.default,forward,10,1,1,1,1,4780,2 +1368,alias_default_299,call_function,alias.default,forward,10,1,1,3,1175,4790,4 +1369,convert_element_type_254,call_function,convert_element_type.default,forward,10,1,1,1,1176,4788,4 +1370,alias_default_301,call_function,alias.default,forward,10,1,1,2,1177,4787,4 +1371,pow_22,call_function,pow.Tensor_Scalar,forward,10,1,1,1,1178,4786,4 +1372,mean_21,call_function,mean.dim,forward,10,1,1,1,1179,4785,4 +1373,add_52,call_function,add.Scalar,forward,10,1,1,1,1180,4784,3 +1374,rsqrt_21,call_function,rsqrt.default,forward,10,1,1,1,1181,4783,3 +1375,alias_default_302,call_function,alias.default,forward,10,1,1,3,1182,4782,3 +1376,mul_74,call_function,mul.Tensor,forward,10,2,2,1,1183,4778,8 
+1377,alias_default_300,call_function,alias.default,forward,10,1,1,2,2,4779,2 +1378,mul_75,call_function,mul.Tensor,forward,10,2,2,1,1187,4777,8 +1379,convert_element_type_255,call_function,convert_element_type.default,forward,10,1,1,1,1188,4776,6 +1380,dtype_cast_97,call_function,dtype_cast.default,forward,10,1,1,1,1,4776,3 +1381,permute_118,call_function,permute.default,forward,10,1,1,1,2,4775,3 +1382,alias_default_303,call_function,alias.default,forward,10,1,1,4,1189,4775,4 +1383,alias_default_304,call_function,alias.default,forward,10,1,1,2,3,4774,3 +1384,einsum_default_74,call_function,einsum.default,forward,10,2,2,1,1194,4772,5 +1385,alias_default_305,call_function,alias.default,forward,10,1,1,2,1195,4771,4 +1386,convert_element_type_258,call_function,convert_element_type.default,forward,10,1,1,1,1196,4759,4 +1387,alias_default_306,call_function,alias.default,forward,10,1,1,2,1197,4758,4 +1388,neg_10,call_function,neg.default,forward,10,1,1,1,1198,4757,8 +1389,exp_10,call_function,exp.default,forward,10,1,1,1,1199,4756,6 +1390,add_53,call_function,add.Tensor,forward,10,1,1,1,1200,4755,4 +1391,div_10,call_function,div.Tensor,forward,10,2,2,1,1201,4754,6 +1392,convert_element_type_259,call_function,convert_element_type.default,forward,10,1,1,1,1202,4753,6 +1393,dtype_cast_98,call_function,dtype_cast.default,forward,10,1,1,1,1,4757,3 +1394,permute_119,call_function,permute.default,forward,10,1,1,1,2,4756,3 +1395,alias_default_308,call_function,alias.default,forward,10,1,1,2,3,4755,3 +1396,einsum_default_75,call_function,einsum.default,forward,10,2,2,1,1194,4753,5 +1397,alias_default_307,call_function,alias.default,forward,10,1,1,2,1203,4752,4 +1398,alias_default_309,call_function,alias.default,forward,10,1,1,2,1195,4752,4 +1399,mul_76,call_function,mul.Tensor,forward,10,2,2,1,1210,4751,8 +1400,dtype_cast_99,call_function,dtype_cast.default,forward,10,1,1,1,1,4753,3 +1401,permute_120,call_function,permute.default,forward,10,1,1,1,2,4752,3 
+1402,alias_default_310,call_function,alias.default,forward,10,1,1,2,1211,4750,4 +1403,alias_default_311,call_function,alias.default,forward,10,1,1,2,3,4751,3 +1404,einsum_default_76,call_function,einsum.default,forward,10,2,2,1,1216,4749,5 +1405,add_54,call_function,add.Tensor,forward,10,2,2,1,1217,4748,10 +1406,dtype_cast_100,call_function,dtype_cast.default,forward,11,1,1,1,1,4737,2 +1407,alias_default_312,call_function,alias.default,forward,10,1,1,3,1218,4747,4 +1408,convert_element_type_264,call_function,convert_element_type.default,forward,11,1,1,1,1219,4745,4 +1409,alias_default_314,call_function,alias.default,forward,11,1,1,2,1220,4744,4 +1410,pow_23,call_function,pow.Tensor_Scalar,forward,11,1,1,1,1221,4743,4 +1411,mean_22,call_function,mean.dim,forward,11,1,1,1,1222,4742,4 +1412,add_55,call_function,add.Scalar,forward,11,1,1,1,1223,4741,3 +1413,rsqrt_22,call_function,rsqrt.default,forward,11,1,1,1,1224,4740,3 +1414,alias_default_315,call_function,alias.default,forward,11,1,1,3,1225,4739,3 +1415,mul_77,call_function,mul.Tensor,forward,11,2,2,1,1226,4735,8 +1416,alias_default_313,call_function,alias.default,forward,11,1,1,2,2,4736,2 +1417,mul_78,call_function,mul.Tensor,forward,11,2,2,1,1230,4734,8 +1418,convert_element_type_265,call_function,convert_element_type.default,forward,11,1,1,1,1231,4733,6 +1419,dtype_cast_101,call_function,dtype_cast.default,forward,11,1,1,1,1,4720,3 +1420,permute_121,call_function,permute.default,forward,11,1,1,1,2,4719,3 +1421,alias_default_316,call_function,alias.default,forward,11,1,1,6,1232,4732,4 +1422,alias_default_317,call_function,alias.default,forward,11,1,1,2,3,4718,3 +1423,einsum_default_77,call_function,einsum.default,forward,11,2,2,1,1237,4716,5 +1424,dtype_cast_102,call_function,dtype_cast.default,forward,11,1,1,1,1,4720,3 +1425,permute_122,call_function,permute.default,forward,11,1,1,1,2,4719,3 +1426,alias_default_318,call_function,alias.default,forward,11,1,1,2,3,4718,3 
+1427,einsum_default_78,call_function,einsum.default,forward,11,2,2,1,1237,4716,5 +1428,dtype_cast_103,call_function,dtype_cast.default,forward,11,1,1,1,1,4713,3 +1429,permute_123,call_function,permute.default,forward,11,1,1,1,2,4712,3 +1430,alias_default_319,call_function,alias.default,forward,11,1,1,2,3,4711,3 +1431,einsum_default_79,call_function,einsum.default,forward,11,2,2,1,1237,4709,5 +1432,view_259,call_function,view.default,forward,11,1,1,1,1238,4715,4 +1433,view_260,call_function,view.default,forward,11,1,1,1,1238,4715,4 +1434,view_261,call_function,view.default,forward,11,1,1,1,1238,4708,4 +1435,convert_element_type_272,call_function,convert_element_type.default,forward,11,1,1,1,1239,4714,4 +1436,view_262,call_function,view.default,forward,11,1,1,1,1240,4713,4 +1437,view_as_complex_22,call_function,view_as_complex.default,forward,11,1,1,1,1241,4712,6 +1438,convert_element_type_273,call_function,convert_element_type.default,forward,11,1,1,1,1239,4714,4 +1439,view_263,call_function,view.default,forward,11,1,1,1,1240,4713,4 +1440,view_as_complex_23,call_function,view_as_complex.default,forward,11,1,1,1,1241,4712,6 +1441,view_264,call_function,view.default,forward,11,1,1,1,2,4723,3 +1442,alias_default_320,call_function,alias.default,forward,11,1,1,4,3,4722,3 +1443,mul_79,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8 +1444,view_as_real_22,call_function,view_as_real.default,forward,11,1,1,1,1245,4710,6 +1445,view_265,call_function,view.default,forward,11,1,1,1,1246,4709,6 +1446,mul_80,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8 +1447,view_as_real_23,call_function,view_as_real.default,forward,11,1,1,1,1245,4710,6 +1448,view_266,call_function,view.default,forward,11,1,1,1,1246,4709,6 +1449,convert_element_type_274,call_function,convert_element_type.default,forward,11,1,1,1,1247,4708,6 +1450,convert_element_type_275,call_function,convert_element_type.default,forward,11,1,1,1,1247,4708,6 
+1451,permute_124,call_function,permute.default,forward,11,1,1,1,1248,4707,6 +1452,permute_125,call_function,permute.default,forward,11,1,1,1,1248,4707,6 +1453,permute_126,call_function,permute.default,forward,11,1,1,1,1239,4707,4 +1454,alias_default_321,call_function,alias.default,forward,11,1,1,2,1249,4706,4 +1455,alias_default_322,call_function,alias.default,forward,11,1,1,2,1249,4706,4 +1456,alias_default_323,call_function,alias.default,forward,11,1,1,2,1240,4706,4 +1457,_scaled_dot_product_flash_attention_11,call_function,_scaled_dot_product_flash_attention.default,forward,11,3,3,4,1273,4705,2 +1458,getitem_99,call_function,getitem,forward,11,1,1,1,1274,4701,2 +1459,getitem_100,call_function,getitem,forward,11,1,1,1,1274,1274,2 +1460,getitem_105,call_function,getitem,forward,11,1,1,1,1274,1274,1 +1461,getitem_106,call_function,getitem,forward,11,1,1,1,1274,1274,1 +1462,alias_default_324,call_function,alias.default,forward,11,1,1,2,1275,4700,4 +1463,permute_127,call_function,permute.default,forward,11,1,1,1,1276,4699,4 +1464,view_267,call_function,view.default,forward,11,1,1,1,1277,4698,3 +1465,dtype_cast_104,call_function,dtype_cast.default,forward,11,1,1,1,1,4700,3 +1466,permute_128,call_function,permute.default,forward,11,1,1,1,2,4699,3 +1467,alias_default_325,call_function,alias.default,forward,11,1,1,2,1278,4697,4 +1468,alias_default_326,call_function,alias.default,forward,11,1,1,2,3,4698,3 +1469,einsum_default_80,call_function,einsum.default,forward,11,2,2,1,1283,4696,5 +1470,add_56,call_function,add.Tensor,forward,11,2,2,1,1284,4695,10 +1471,dtype_cast_105,call_function,dtype_cast.default,forward,11,1,1,1,1,4684,2 +1472,alias_default_327,call_function,alias.default,forward,11,1,1,3,1285,4694,4 +1473,convert_element_type_278,call_function,convert_element_type.default,forward,11,1,1,1,1286,4692,4 +1474,alias_default_329,call_function,alias.default,forward,11,1,1,2,1287,4691,4 +1475,pow_24,call_function,pow.Tensor_Scalar,forward,11,1,1,1,1288,4690,4 
+1476,mean_23,call_function,mean.dim,forward,11,1,1,1,1289,4689,4 +1477,add_57,call_function,add.Scalar,forward,11,1,1,1,1290,4688,3 +1478,rsqrt_23,call_function,rsqrt.default,forward,11,1,1,1,1291,4687,3 +1479,alias_default_330,call_function,alias.default,forward,11,1,1,3,1292,4686,3 +1480,mul_81,call_function,mul.Tensor,forward,11,2,2,1,1293,4682,8 +1481,alias_default_328,call_function,alias.default,forward,11,1,1,2,2,4683,2 +1482,mul_82,call_function,mul.Tensor,forward,11,2,2,1,1297,4681,8 +1483,convert_element_type_279,call_function,convert_element_type.default,forward,11,1,1,1,1298,4680,6 +1484,dtype_cast_106,call_function,dtype_cast.default,forward,11,1,1,1,1,4680,3 +1485,permute_129,call_function,permute.default,forward,11,1,1,1,2,4679,3 +1486,alias_default_331,call_function,alias.default,forward,11,1,1,4,1299,4679,4 +1487,alias_default_332,call_function,alias.default,forward,11,1,1,2,3,4678,3 +1488,einsum_default_81,call_function,einsum.default,forward,11,2,2,1,1304,4676,5 +1489,alias_default_333,call_function,alias.default,forward,11,1,1,2,1305,4675,4 +1490,convert_element_type_282,call_function,convert_element_type.default,forward,11,1,1,1,1306,4663,4 +1491,alias_default_334,call_function,alias.default,forward,11,1,1,2,1307,4662,4 +1492,neg_11,call_function,neg.default,forward,11,1,1,1,1308,4661,8 +1493,exp_11,call_function,exp.default,forward,11,1,1,1,1309,4660,6 +1494,add_58,call_function,add.Tensor,forward,11,1,1,1,1310,4659,4 +1495,div_11,call_function,div.Tensor,forward,11,2,2,1,1311,4658,6 +1496,convert_element_type_283,call_function,convert_element_type.default,forward,11,1,1,1,1312,4657,6 +1497,dtype_cast_107,call_function,dtype_cast.default,forward,11,1,1,1,1,4661,3 +1498,permute_130,call_function,permute.default,forward,11,1,1,1,2,4660,3 +1499,alias_default_336,call_function,alias.default,forward,11,1,1,2,3,4659,3 +1500,einsum_default_82,call_function,einsum.default,forward,11,2,2,1,1304,4657,5 
+1501,alias_default_335,call_function,alias.default,forward,11,1,1,2,1313,4656,4 +1502,alias_default_337,call_function,alias.default,forward,11,1,1,2,1305,4656,4 +1503,mul_83,call_function,mul.Tensor,forward,11,2,2,1,1320,4655,8 +1504,dtype_cast_108,call_function,dtype_cast.default,forward,11,1,1,1,1,4657,3 +1505,permute_131,call_function,permute.default,forward,11,1,1,1,2,4656,3 +1506,alias_default_338,call_function,alias.default,forward,11,1,1,2,1321,4654,4 +1507,alias_default_339,call_function,alias.default,forward,11,1,1,2,3,4655,3 +1508,einsum_default_83,call_function,einsum.default,forward,11,2,2,1,1326,4653,5 +1509,add_59,call_function,add.Tensor,forward,11,2,2,1,1327,4652,10 +1510,dtype_cast_109,call_function,dtype_cast.default,forward,12,1,1,1,1,4641,2 +1511,alias_default_340,call_function,alias.default,forward,11,1,1,3,1328,4651,4 +1512,convert_element_type_288,call_function,convert_element_type.default,forward,12,1,1,1,1329,4649,4 +1513,alias_default_342,call_function,alias.default,forward,12,1,1,2,1330,4648,4 +1514,pow_25,call_function,pow.Tensor_Scalar,forward,12,1,1,1,1331,4647,4 +1515,mean_24,call_function,mean.dim,forward,12,1,1,1,1332,4646,4 +1516,add_60,call_function,add.Scalar,forward,12,1,1,1,1333,4645,3 +1517,rsqrt_24,call_function,rsqrt.default,forward,12,1,1,1,1334,4644,3 +1518,alias_default_343,call_function,alias.default,forward,12,1,1,3,1335,4643,3 +1519,mul_84,call_function,mul.Tensor,forward,12,2,2,1,1336,4639,8 +1520,alias_default_341,call_function,alias.default,forward,12,1,1,2,2,4640,2 +1521,mul_85,call_function,mul.Tensor,forward,12,2,2,1,1340,4638,8 +1522,convert_element_type_289,call_function,convert_element_type.default,forward,12,1,1,1,1341,4637,6 +1523,dtype_cast_110,call_function,dtype_cast.default,forward,12,1,1,1,1,4624,3 +1524,permute_132,call_function,permute.default,forward,12,1,1,1,2,4623,3 +1525,alias_default_344,call_function,alias.default,forward,12,1,1,6,1342,4636,4 
+1526,alias_default_345,call_function,alias.default,forward,12,1,1,2,3,4622,3 +1527,einsum_default_84,call_function,einsum.default,forward,12,2,2,1,1347,4620,5 +1528,dtype_cast_111,call_function,dtype_cast.default,forward,12,1,1,1,1,4624,3 +1529,permute_133,call_function,permute.default,forward,12,1,1,1,2,4623,3 +1530,alias_default_346,call_function,alias.default,forward,12,1,1,2,3,4622,3 +1531,einsum_default_85,call_function,einsum.default,forward,12,2,2,1,1347,4620,5 +1532,dtype_cast_112,call_function,dtype_cast.default,forward,12,1,1,1,1,4617,3 +1533,permute_134,call_function,permute.default,forward,12,1,1,1,2,4616,3 +1534,alias_default_347,call_function,alias.default,forward,12,1,1,2,3,4615,3 +1535,einsum_default_86,call_function,einsum.default,forward,12,2,2,1,1347,4613,5 +1536,view_282,call_function,view.default,forward,12,1,1,1,1348,4619,4 +1537,view_283,call_function,view.default,forward,12,1,1,1,1348,4619,4 +1538,view_284,call_function,view.default,forward,12,1,1,1,1348,4612,4 +1539,convert_element_type_296,call_function,convert_element_type.default,forward,12,1,1,1,1349,4618,4 +1540,view_285,call_function,view.default,forward,12,1,1,1,1350,4617,4 +1541,view_as_complex_24,call_function,view_as_complex.default,forward,12,1,1,1,1351,4616,6 +1542,convert_element_type_297,call_function,convert_element_type.default,forward,12,1,1,1,1349,4618,4 +1543,view_286,call_function,view.default,forward,12,1,1,1,1350,4617,4 +1544,view_as_complex_25,call_function,view_as_complex.default,forward,12,1,1,1,1351,4616,6 +1545,view_287,call_function,view.default,forward,12,1,1,1,2,4627,3 +1546,alias_default_348,call_function,alias.default,forward,12,1,1,4,3,4626,3 +1547,mul_86,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8 +1548,view_as_real_24,call_function,view_as_real.default,forward,12,1,1,1,1355,4614,6 +1549,view_288,call_function,view.default,forward,12,1,1,1,1356,4613,6 +1550,mul_87,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8 
+1551,view_as_real_25,call_function,view_as_real.default,forward,12,1,1,1,1355,4614,6 +1552,view_289,call_function,view.default,forward,12,1,1,1,1356,4613,6 +1553,convert_element_type_298,call_function,convert_element_type.default,forward,12,1,1,1,1357,4612,6 +1554,convert_element_type_299,call_function,convert_element_type.default,forward,12,1,1,1,1357,4612,6 +1555,permute_135,call_function,permute.default,forward,12,1,1,1,1358,4611,6 +1556,permute_136,call_function,permute.default,forward,12,1,1,1,1358,4611,6 +1557,permute_137,call_function,permute.default,forward,12,1,1,1,1349,4611,4 +1558,alias_default_349,call_function,alias.default,forward,12,1,1,2,1359,4610,4 +1559,alias_default_350,call_function,alias.default,forward,12,1,1,2,1359,4610,4 +1560,alias_default_351,call_function,alias.default,forward,12,1,1,2,1350,4610,4 +1561,_scaled_dot_product_flash_attention_12,call_function,_scaled_dot_product_flash_attention.default,forward,12,3,3,4,1383,4609,2 +1562,getitem_108,call_function,getitem,forward,12,1,1,1,1384,4605,2 +1563,getitem_109,call_function,getitem,forward,12,1,1,1,1384,1384,2 +1564,getitem_114,call_function,getitem,forward,12,1,1,1,1384,1384,1 +1565,getitem_115,call_function,getitem,forward,12,1,1,1,1384,1384,1 +1566,alias_default_352,call_function,alias.default,forward,12,1,1,2,1385,4604,4 +1567,permute_138,call_function,permute.default,forward,12,1,1,1,1386,4603,4 +1568,view_290,call_function,view.default,forward,12,1,1,1,1387,4602,3 +1569,dtype_cast_113,call_function,dtype_cast.default,forward,12,1,1,1,1,4604,3 +1570,permute_139,call_function,permute.default,forward,12,1,1,1,2,4603,3 +1571,alias_default_353,call_function,alias.default,forward,12,1,1,2,1388,4601,4 +1572,alias_default_354,call_function,alias.default,forward,12,1,1,2,3,4602,3 +1573,einsum_default_87,call_function,einsum.default,forward,12,2,2,1,1393,4600,5 +1574,add_61,call_function,add.Tensor,forward,12,2,2,1,1394,4599,10 
+1575,dtype_cast_114,call_function,dtype_cast.default,forward,12,1,1,1,1,4588,2 +1576,alias_default_355,call_function,alias.default,forward,12,1,1,3,1395,4598,4 +1577,convert_element_type_302,call_function,convert_element_type.default,forward,12,1,1,1,1396,4596,4 +1578,alias_default_357,call_function,alias.default,forward,12,1,1,2,1397,4595,4 +1579,pow_26,call_function,pow.Tensor_Scalar,forward,12,1,1,1,1398,4594,4 +1580,mean_25,call_function,mean.dim,forward,12,1,1,1,1399,4593,4 +1581,add_62,call_function,add.Scalar,forward,12,1,1,1,1400,4592,3 +1582,rsqrt_25,call_function,rsqrt.default,forward,12,1,1,1,1401,4591,3 +1583,alias_default_358,call_function,alias.default,forward,12,1,1,3,1402,4590,3 +1584,mul_88,call_function,mul.Tensor,forward,12,2,2,1,1403,4586,8 +1585,alias_default_356,call_function,alias.default,forward,12,1,1,2,2,4587,2 +1586,mul_89,call_function,mul.Tensor,forward,12,2,2,1,1407,4585,8 +1587,convert_element_type_303,call_function,convert_element_type.default,forward,12,1,1,1,1408,4584,6 +1588,dtype_cast_115,call_function,dtype_cast.default,forward,12,1,1,1,1,4584,3 +1589,permute_140,call_function,permute.default,forward,12,1,1,1,2,4583,3 +1590,alias_default_359,call_function,alias.default,forward,12,1,1,4,1409,4583,4 +1591,alias_default_360,call_function,alias.default,forward,12,1,1,2,3,4582,3 +1592,einsum_default_88,call_function,einsum.default,forward,12,2,2,1,1414,4580,5 +1593,alias_default_361,call_function,alias.default,forward,12,1,1,2,1415,4579,4 +1594,convert_element_type_306,call_function,convert_element_type.default,forward,12,1,1,1,1416,4567,4 +1595,alias_default_362,call_function,alias.default,forward,12,1,1,2,1417,4566,4 +1596,neg_12,call_function,neg.default,forward,12,1,1,1,1418,4565,8 +1597,exp_12,call_function,exp.default,forward,12,1,1,1,1419,4564,6 +1598,add_63,call_function,add.Tensor,forward,12,1,1,1,1420,4563,4 +1599,div_12,call_function,div.Tensor,forward,12,2,2,1,1421,4562,6 
+1600,convert_element_type_307,call_function,convert_element_type.default,forward,12,1,1,1,1422,4561,6 +1601,dtype_cast_116,call_function,dtype_cast.default,forward,12,1,1,1,1,4565,3 +1602,permute_141,call_function,permute.default,forward,12,1,1,1,2,4564,3 +1603,alias_default_364,call_function,alias.default,forward,12,1,1,2,3,4563,3 +1604,einsum_default_89,call_function,einsum.default,forward,12,2,2,1,1414,4561,5 +1605,alias_default_363,call_function,alias.default,forward,12,1,1,2,1423,4560,4 +1606,alias_default_365,call_function,alias.default,forward,12,1,1,2,1415,4560,4 +1607,mul_90,call_function,mul.Tensor,forward,12,2,2,1,1430,4559,8 +1608,dtype_cast_117,call_function,dtype_cast.default,forward,12,1,1,1,1,4561,3 +1609,permute_142,call_function,permute.default,forward,12,1,1,1,2,4560,3 +1610,alias_default_366,call_function,alias.default,forward,12,1,1,2,1431,4558,4 +1611,alias_default_367,call_function,alias.default,forward,12,1,1,2,3,4559,3 +1612,einsum_default_90,call_function,einsum.default,forward,12,2,2,1,1436,4557,5 +1613,add_64,call_function,add.Tensor,forward,12,2,2,1,1437,4556,10 +1614,dtype_cast_118,call_function,dtype_cast.default,forward,13,1,1,1,1,4545,2 +1615,alias_default_368,call_function,alias.default,forward,12,1,1,3,1438,4555,4 +1616,convert_element_type_312,call_function,convert_element_type.default,forward,13,1,1,1,1439,4553,4 +1617,alias_default_370,call_function,alias.default,forward,13,1,1,2,1440,4552,4 +1618,pow_27,call_function,pow.Tensor_Scalar,forward,13,1,1,1,1441,4551,4 +1619,mean_26,call_function,mean.dim,forward,13,1,1,1,1442,4550,4 +1620,add_65,call_function,add.Scalar,forward,13,1,1,1,1443,4549,3 +1621,rsqrt_26,call_function,rsqrt.default,forward,13,1,1,1,1444,4548,3 +1622,alias_default_371,call_function,alias.default,forward,13,1,1,3,1445,4547,3 +1623,mul_91,call_function,mul.Tensor,forward,13,2,2,1,1446,4543,8 +1624,alias_default_369,call_function,alias.default,forward,13,1,1,2,2,4544,2 
+1625,mul_92,call_function,mul.Tensor,forward,13,2,2,1,1450,4542,8 +1626,convert_element_type_313,call_function,convert_element_type.default,forward,13,1,1,1,1451,4541,6 +1627,dtype_cast_119,call_function,dtype_cast.default,forward,13,1,1,1,1,4528,3 +1628,permute_143,call_function,permute.default,forward,13,1,1,1,2,4527,3 +1629,alias_default_372,call_function,alias.default,forward,13,1,1,6,1452,4540,4 +1630,alias_default_373,call_function,alias.default,forward,13,1,1,2,3,4526,3 +1631,einsum_default_91,call_function,einsum.default,forward,13,2,2,1,1457,4524,5 +1632,dtype_cast_120,call_function,dtype_cast.default,forward,13,1,1,1,1,4528,3 +1633,permute_144,call_function,permute.default,forward,13,1,1,1,2,4527,3 +1634,alias_default_374,call_function,alias.default,forward,13,1,1,2,3,4526,3 +1635,einsum_default_92,call_function,einsum.default,forward,13,2,2,1,1457,4524,5 +1636,dtype_cast_121,call_function,dtype_cast.default,forward,13,1,1,1,1,4521,3 +1637,permute_145,call_function,permute.default,forward,13,1,1,1,2,4520,3 +1638,alias_default_375,call_function,alias.default,forward,13,1,1,2,3,4519,3 +1639,einsum_default_93,call_function,einsum.default,forward,13,2,2,1,1457,4517,5 +1640,view_305,call_function,view.default,forward,13,1,1,1,1458,4523,4 +1641,view_306,call_function,view.default,forward,13,1,1,1,1458,4523,4 +1642,view_307,call_function,view.default,forward,13,1,1,1,1458,4516,4 +1643,convert_element_type_320,call_function,convert_element_type.default,forward,13,1,1,1,1459,4522,4 +1644,view_308,call_function,view.default,forward,13,1,1,1,1460,4521,4 +1645,view_as_complex_26,call_function,view_as_complex.default,forward,13,1,1,1,1461,4520,6 +1646,convert_element_type_321,call_function,convert_element_type.default,forward,13,1,1,1,1459,4522,4 +1647,view_309,call_function,view.default,forward,13,1,1,1,1460,4521,4 +1648,view_as_complex_27,call_function,view_as_complex.default,forward,13,1,1,1,1461,4520,6 
+1649,view_310,call_function,view.default,forward,13,1,1,1,2,4531,3 +1650,alias_default_376,call_function,alias.default,forward,13,1,1,4,3,4530,3 +1651,mul_93,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8 +1652,view_as_real_26,call_function,view_as_real.default,forward,13,1,1,1,1465,4518,6 +1653,view_311,call_function,view.default,forward,13,1,1,1,1466,4517,6 +1654,mul_94,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8 +1655,view_as_real_27,call_function,view_as_real.default,forward,13,1,1,1,1465,4518,6 +1656,view_312,call_function,view.default,forward,13,1,1,1,1466,4517,6 +1657,convert_element_type_322,call_function,convert_element_type.default,forward,13,1,1,1,1467,4516,6 +1658,convert_element_type_323,call_function,convert_element_type.default,forward,13,1,1,1,1467,4516,6 +1659,permute_146,call_function,permute.default,forward,13,1,1,1,1468,4515,6 +1660,permute_147,call_function,permute.default,forward,13,1,1,1,1468,4515,6 +1661,permute_148,call_function,permute.default,forward,13,1,1,1,1459,4515,4 +1662,alias_default_377,call_function,alias.default,forward,13,1,1,2,1469,4514,4 +1663,alias_default_378,call_function,alias.default,forward,13,1,1,2,1469,4514,4 +1664,alias_default_379,call_function,alias.default,forward,13,1,1,2,1460,4514,4 +1665,_scaled_dot_product_flash_attention_13,call_function,_scaled_dot_product_flash_attention.default,forward,13,3,3,4,1493,4513,2 +1666,getitem_117,call_function,getitem,forward,13,1,1,1,1494,4509,2 +1667,getitem_118,call_function,getitem,forward,13,1,1,1,1494,1494,2 +1668,getitem_123,call_function,getitem,forward,13,1,1,1,1494,1494,1 +1669,getitem_124,call_function,getitem,forward,13,1,1,1,1494,1494,1 +1670,alias_default_380,call_function,alias.default,forward,13,1,1,2,1495,4508,4 +1671,permute_149,call_function,permute.default,forward,13,1,1,1,1496,4507,4 +1672,view_313,call_function,view.default,forward,13,1,1,1,1497,4506,3 +1673,dtype_cast_122,call_function,dtype_cast.default,forward,13,1,1,1,1,4508,3 
+1674,permute_150,call_function,permute.default,forward,13,1,1,1,2,4507,3 +1675,alias_default_381,call_function,alias.default,forward,13,1,1,2,1498,4505,4 +1676,alias_default_382,call_function,alias.default,forward,13,1,1,2,3,4506,3 +1677,einsum_default_94,call_function,einsum.default,forward,13,2,2,1,1503,4504,5 +1678,add_66,call_function,add.Tensor,forward,13,2,2,1,1504,4503,10 +1679,dtype_cast_123,call_function,dtype_cast.default,forward,13,1,1,1,1,4492,2 +1680,alias_default_383,call_function,alias.default,forward,13,1,1,3,1505,4502,4 +1681,convert_element_type_326,call_function,convert_element_type.default,forward,13,1,1,1,1506,4500,4 +1682,alias_default_385,call_function,alias.default,forward,13,1,1,2,1507,4499,4 +1683,pow_28,call_function,pow.Tensor_Scalar,forward,13,1,1,1,1508,4498,4 +1684,mean_27,call_function,mean.dim,forward,13,1,1,1,1509,4497,4 +1685,add_67,call_function,add.Scalar,forward,13,1,1,1,1510,4496,3 +1686,rsqrt_27,call_function,rsqrt.default,forward,13,1,1,1,1511,4495,3 +1687,alias_default_386,call_function,alias.default,forward,13,1,1,3,1512,4494,3 +1688,mul_95,call_function,mul.Tensor,forward,13,2,2,1,1513,4490,8 +1689,alias_default_384,call_function,alias.default,forward,13,1,1,2,2,4491,2 +1690,mul_96,call_function,mul.Tensor,forward,13,2,2,1,1517,4489,8 +1691,convert_element_type_327,call_function,convert_element_type.default,forward,13,1,1,1,1518,4488,6 +1692,dtype_cast_124,call_function,dtype_cast.default,forward,13,1,1,1,1,4488,3 +1693,permute_151,call_function,permute.default,forward,13,1,1,1,2,4487,3 +1694,alias_default_387,call_function,alias.default,forward,13,1,1,4,1519,4487,4 +1695,alias_default_388,call_function,alias.default,forward,13,1,1,2,3,4486,3 +1696,einsum_default_95,call_function,einsum.default,forward,13,2,2,1,1524,4484,5 +1697,alias_default_389,call_function,alias.default,forward,13,1,1,2,1525,4483,4 +1698,convert_element_type_330,call_function,convert_element_type.default,forward,13,1,1,1,1526,4471,4 
+1699,alias_default_390,call_function,alias.default,forward,13,1,1,2,1527,4470,4 +1700,neg_13,call_function,neg.default,forward,13,1,1,1,1528,4469,8 +1701,exp_13,call_function,exp.default,forward,13,1,1,1,1529,4468,6 +1702,add_68,call_function,add.Tensor,forward,13,1,1,1,1530,4467,4 +1703,div_13,call_function,div.Tensor,forward,13,2,2,1,1531,4466,6 +1704,convert_element_type_331,call_function,convert_element_type.default,forward,13,1,1,1,1532,4465,6 +1705,dtype_cast_125,call_function,dtype_cast.default,forward,13,1,1,1,1,4469,3 +1706,permute_152,call_function,permute.default,forward,13,1,1,1,2,4468,3 +1707,alias_default_392,call_function,alias.default,forward,13,1,1,2,3,4467,3 +1708,einsum_default_96,call_function,einsum.default,forward,13,2,2,1,1524,4465,5 +1709,alias_default_391,call_function,alias.default,forward,13,1,1,2,1533,4464,4 +1710,alias_default_393,call_function,alias.default,forward,13,1,1,2,1525,4464,4 +1711,mul_97,call_function,mul.Tensor,forward,13,2,2,1,1540,4463,8 +1712,dtype_cast_126,call_function,dtype_cast.default,forward,13,1,1,1,1,4465,3 +1713,permute_153,call_function,permute.default,forward,13,1,1,1,2,4464,3 +1714,alias_default_394,call_function,alias.default,forward,13,1,1,2,1541,4462,4 +1715,alias_default_395,call_function,alias.default,forward,13,1,1,2,3,4463,3 +1716,einsum_default_97,call_function,einsum.default,forward,13,2,2,1,1546,4461,5 +1717,add_69,call_function,add.Tensor,forward,13,2,2,1,1547,4460,10 +1718,dtype_cast_127,call_function,dtype_cast.default,forward,14,1,1,1,1,4449,2 +1719,alias_default_396,call_function,alias.default,forward,13,1,1,3,1548,4459,4 +1720,convert_element_type_336,call_function,convert_element_type.default,forward,14,1,1,1,1549,4457,4 +1721,alias_default_398,call_function,alias.default,forward,14,1,1,2,1550,4456,4 +1722,pow_29,call_function,pow.Tensor_Scalar,forward,14,1,1,1,1551,4455,4 +1723,mean_28,call_function,mean.dim,forward,14,1,1,1,1552,4454,4 
+1724,add_70,call_function,add.Scalar,forward,14,1,1,1,1553,4453,3 +1725,rsqrt_28,call_function,rsqrt.default,forward,14,1,1,1,1554,4452,3 +1726,alias_default_399,call_function,alias.default,forward,14,1,1,3,1555,4451,3 +1727,mul_98,call_function,mul.Tensor,forward,14,2,2,1,1556,4447,8 +1728,alias_default_397,call_function,alias.default,forward,14,1,1,2,2,4448,2 +1729,mul_99,call_function,mul.Tensor,forward,14,2,2,1,1560,4446,8 +1730,convert_element_type_337,call_function,convert_element_type.default,forward,14,1,1,1,1561,4445,6 +1731,dtype_cast_128,call_function,dtype_cast.default,forward,14,1,1,1,1,4432,3 +1732,permute_154,call_function,permute.default,forward,14,1,1,1,2,4431,3 +1733,alias_default_400,call_function,alias.default,forward,14,1,1,6,1562,4444,4 +1734,alias_default_401,call_function,alias.default,forward,14,1,1,2,3,4430,3 +1735,einsum_default_98,call_function,einsum.default,forward,14,2,2,1,1567,4428,5 +1736,dtype_cast_129,call_function,dtype_cast.default,forward,14,1,1,1,1,4432,3 +1737,permute_155,call_function,permute.default,forward,14,1,1,1,2,4431,3 +1738,alias_default_402,call_function,alias.default,forward,14,1,1,2,3,4430,3 +1739,einsum_default_99,call_function,einsum.default,forward,14,2,2,1,1567,4428,5 +1740,dtype_cast_130,call_function,dtype_cast.default,forward,14,1,1,1,1,4425,3 +1741,permute_156,call_function,permute.default,forward,14,1,1,1,2,4424,3 +1742,alias_default_403,call_function,alias.default,forward,14,1,1,2,3,4423,3 +1743,einsum_default_100,call_function,einsum.default,forward,14,2,2,1,1567,4421,5 +1744,view_328,call_function,view.default,forward,14,1,1,1,1568,4427,4 +1745,view_329,call_function,view.default,forward,14,1,1,1,1568,4427,4 +1746,view_330,call_function,view.default,forward,14,1,1,1,1568,4420,4 +1747,convert_element_type_344,call_function,convert_element_type.default,forward,14,1,1,1,1569,4426,4 +1748,view_331,call_function,view.default,forward,14,1,1,1,1570,4425,4 
+1749,view_as_complex_28,call_function,view_as_complex.default,forward,14,1,1,1,1571,4424,6 +1750,convert_element_type_345,call_function,convert_element_type.default,forward,14,1,1,1,1569,4426,4 +1751,view_332,call_function,view.default,forward,14,1,1,1,1570,4425,4 +1752,view_as_complex_29,call_function,view_as_complex.default,forward,14,1,1,1,1571,4424,6 +1753,view_333,call_function,view.default,forward,14,1,1,1,2,4435,3 +1754,alias_default_404,call_function,alias.default,forward,14,1,1,4,3,4434,3 +1755,mul_100,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8 +1756,view_as_real_28,call_function,view_as_real.default,forward,14,1,1,1,1575,4422,6 +1757,view_334,call_function,view.default,forward,14,1,1,1,1576,4421,6 +1758,mul_101,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8 +1759,view_as_real_29,call_function,view_as_real.default,forward,14,1,1,1,1575,4422,6 +1760,view_335,call_function,view.default,forward,14,1,1,1,1576,4421,6 +1761,convert_element_type_346,call_function,convert_element_type.default,forward,14,1,1,1,1577,4420,6 +1762,convert_element_type_347,call_function,convert_element_type.default,forward,14,1,1,1,1577,4420,6 +1763,permute_157,call_function,permute.default,forward,14,1,1,1,1578,4419,6 +1764,permute_158,call_function,permute.default,forward,14,1,1,1,1578,4419,6 +1765,permute_159,call_function,permute.default,forward,14,1,1,1,1569,4419,4 +1766,alias_default_405,call_function,alias.default,forward,14,1,1,2,1579,4418,4 +1767,alias_default_406,call_function,alias.default,forward,14,1,1,2,1579,4418,4 +1768,alias_default_407,call_function,alias.default,forward,14,1,1,2,1570,4418,4 +1769,_scaled_dot_product_flash_attention_14,call_function,_scaled_dot_product_flash_attention.default,forward,14,3,3,4,1603,4417,2 +1770,getitem_126,call_function,getitem,forward,14,1,1,1,1604,4413,2 +1771,getitem_127,call_function,getitem,forward,14,1,1,1,1604,1604,2 +1772,getitem_132,call_function,getitem,forward,14,1,1,1,1604,1604,1 
+1773,getitem_133,call_function,getitem,forward,14,1,1,1,1604,1604,1 +1774,alias_default_408,call_function,alias.default,forward,14,1,1,2,1605,4412,4 +1775,permute_160,call_function,permute.default,forward,14,1,1,1,1606,4411,4 +1776,view_336,call_function,view.default,forward,14,1,1,1,1607,4410,3 +1777,dtype_cast_131,call_function,dtype_cast.default,forward,14,1,1,1,1,4412,3 +1778,permute_161,call_function,permute.default,forward,14,1,1,1,2,4411,3 +1779,alias_default_409,call_function,alias.default,forward,14,1,1,2,1608,4409,4 +1780,alias_default_410,call_function,alias.default,forward,14,1,1,2,3,4410,3 +1781,einsum_default_101,call_function,einsum.default,forward,14,2,2,1,1613,4408,5 +1782,add_71,call_function,add.Tensor,forward,14,2,2,1,1614,4407,10 +1783,dtype_cast_132,call_function,dtype_cast.default,forward,14,1,1,1,1,4396,2 +1784,alias_default_411,call_function,alias.default,forward,14,1,1,3,1615,4406,4 +1785,convert_element_type_350,call_function,convert_element_type.default,forward,14,1,1,1,1616,4404,4 +1786,alias_default_413,call_function,alias.default,forward,14,1,1,2,1617,4403,4 +1787,pow_30,call_function,pow.Tensor_Scalar,forward,14,1,1,1,1618,4402,4 +1788,mean_29,call_function,mean.dim,forward,14,1,1,1,1619,4401,4 +1789,add_72,call_function,add.Scalar,forward,14,1,1,1,1620,4400,3 +1790,rsqrt_29,call_function,rsqrt.default,forward,14,1,1,1,1621,4399,3 +1791,alias_default_414,call_function,alias.default,forward,14,1,1,3,1622,4398,3 +1792,mul_102,call_function,mul.Tensor,forward,14,2,2,1,1623,4394,8 +1793,alias_default_412,call_function,alias.default,forward,14,1,1,2,2,4395,2 +1794,mul_103,call_function,mul.Tensor,forward,14,2,2,1,1627,4393,8 +1795,convert_element_type_351,call_function,convert_element_type.default,forward,14,1,1,1,1628,4392,6 +1796,dtype_cast_133,call_function,dtype_cast.default,forward,14,1,1,1,1,4392,3 +1797,permute_162,call_function,permute.default,forward,14,1,1,1,2,4391,3 
+1798,alias_default_415,call_function,alias.default,forward,14,1,1,4,1629,4391,4 +1799,alias_default_416,call_function,alias.default,forward,14,1,1,2,3,4390,3 +1800,einsum_default_102,call_function,einsum.default,forward,14,2,2,1,1634,4388,5 +1801,alias_default_417,call_function,alias.default,forward,14,1,1,2,1635,4387,4 +1802,convert_element_type_354,call_function,convert_element_type.default,forward,14,1,1,1,1636,4375,4 +1803,alias_default_418,call_function,alias.default,forward,14,1,1,2,1637,4374,4 +1804,neg_14,call_function,neg.default,forward,14,1,1,1,1638,4373,8 +1805,exp_14,call_function,exp.default,forward,14,1,1,1,1639,4372,6 +1806,add_73,call_function,add.Tensor,forward,14,1,1,1,1640,4371,4 +1807,div_14,call_function,div.Tensor,forward,14,2,2,1,1641,4370,6 +1808,convert_element_type_355,call_function,convert_element_type.default,forward,14,1,1,1,1642,4369,6 +1809,dtype_cast_134,call_function,dtype_cast.default,forward,14,1,1,1,1,4373,3 +1810,permute_163,call_function,permute.default,forward,14,1,1,1,2,4372,3 +1811,alias_default_420,call_function,alias.default,forward,14,1,1,2,3,4371,3 +1812,einsum_default_103,call_function,einsum.default,forward,14,2,2,1,1634,4369,5 +1813,alias_default_419,call_function,alias.default,forward,14,1,1,2,1643,4368,4 +1814,alias_default_421,call_function,alias.default,forward,14,1,1,2,1635,4368,4 +1815,mul_104,call_function,mul.Tensor,forward,14,2,2,1,1650,4367,8 +1816,dtype_cast_135,call_function,dtype_cast.default,forward,14,1,1,1,1,4369,3 +1817,permute_164,call_function,permute.default,forward,14,1,1,1,2,4368,3 +1818,alias_default_422,call_function,alias.default,forward,14,1,1,2,1651,4366,4 +1819,alias_default_423,call_function,alias.default,forward,14,1,1,2,3,4367,3 +1820,einsum_default_104,call_function,einsum.default,forward,14,2,2,1,1656,4365,5 +1821,add_74,call_function,add.Tensor,forward,14,2,2,1,1657,4364,10 +1822,dtype_cast_136,call_function,dtype_cast.default,forward,15,1,1,1,1,4353,2 
+1823,alias_default_424,call_function,alias.default,forward,14,1,1,3,1658,4363,4 +1824,convert_element_type_360,call_function,convert_element_type.default,forward,15,1,1,1,1659,4361,4 +1825,alias_default_426,call_function,alias.default,forward,15,1,1,2,1660,4360,4 +1826,pow_31,call_function,pow.Tensor_Scalar,forward,15,1,1,1,1661,4359,4 +1827,mean_30,call_function,mean.dim,forward,15,1,1,1,1662,4358,4 +1828,add_75,call_function,add.Scalar,forward,15,1,1,1,1663,4357,3 +1829,rsqrt_30,call_function,rsqrt.default,forward,15,1,1,1,1664,4356,3 +1830,alias_default_427,call_function,alias.default,forward,15,1,1,3,1665,4355,3 +1831,mul_105,call_function,mul.Tensor,forward,15,2,2,1,1666,4351,8 +1832,alias_default_425,call_function,alias.default,forward,15,1,1,2,2,4352,2 +1833,mul_106,call_function,mul.Tensor,forward,15,2,2,1,1670,4350,8 +1834,convert_element_type_361,call_function,convert_element_type.default,forward,15,1,1,1,1671,4349,6 +1835,dtype_cast_137,call_function,dtype_cast.default,forward,15,1,1,1,1,4336,3 +1836,permute_165,call_function,permute.default,forward,15,1,1,1,2,4335,3 +1837,alias_default_428,call_function,alias.default,forward,15,1,1,6,1672,4348,4 +1838,alias_default_429,call_function,alias.default,forward,15,1,1,2,3,4334,3 +1839,einsum_default_105,call_function,einsum.default,forward,15,2,2,1,1677,4332,5 +1840,dtype_cast_138,call_function,dtype_cast.default,forward,15,1,1,1,1,4336,3 +1841,permute_166,call_function,permute.default,forward,15,1,1,1,2,4335,3 +1842,alias_default_430,call_function,alias.default,forward,15,1,1,2,3,4334,3 +1843,einsum_default_106,call_function,einsum.default,forward,15,2,2,1,1677,4332,5 +1844,dtype_cast_139,call_function,dtype_cast.default,forward,15,1,1,1,1,4329,3 +1845,permute_167,call_function,permute.default,forward,15,1,1,1,2,4328,3 +1846,alias_default_431,call_function,alias.default,forward,15,1,1,2,3,4327,3 +1847,einsum_default_107,call_function,einsum.default,forward,15,2,2,1,1677,4325,5 
+1848,view_351,call_function,view.default,forward,15,1,1,1,1678,4331,4 +1849,view_352,call_function,view.default,forward,15,1,1,1,1678,4331,4 +1850,view_353,call_function,view.default,forward,15,1,1,1,1678,4324,4 +1851,convert_element_type_368,call_function,convert_element_type.default,forward,15,1,1,1,1679,4330,4 +1852,view_354,call_function,view.default,forward,15,1,1,1,1680,4329,4 +1853,view_as_complex_30,call_function,view_as_complex.default,forward,15,1,1,1,1681,4328,6 +1854,convert_element_type_369,call_function,convert_element_type.default,forward,15,1,1,1,1679,4330,4 +1855,view_355,call_function,view.default,forward,15,1,1,1,1680,4329,4 +1856,view_as_complex_31,call_function,view_as_complex.default,forward,15,1,1,1,1681,4328,6 +1857,view_356,call_function,view.default,forward,15,1,1,1,2,4339,3 +1858,alias_default_432,call_function,alias.default,forward,15,1,1,4,3,4338,3 +1859,mul_107,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8 +1860,view_as_real_30,call_function,view_as_real.default,forward,15,1,1,1,1685,4326,6 +1861,view_357,call_function,view.default,forward,15,1,1,1,1686,4325,6 +1862,mul_108,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8 +1863,view_as_real_31,call_function,view_as_real.default,forward,15,1,1,1,1685,4326,6 +1864,view_358,call_function,view.default,forward,15,1,1,1,1686,4325,6 +1865,convert_element_type_370,call_function,convert_element_type.default,forward,15,1,1,1,1687,4324,6 +1866,convert_element_type_371,call_function,convert_element_type.default,forward,15,1,1,1,1687,4324,6 +1867,permute_168,call_function,permute.default,forward,15,1,1,1,1688,4323,6 +1868,permute_169,call_function,permute.default,forward,15,1,1,1,1688,4323,6 +1869,permute_170,call_function,permute.default,forward,15,1,1,1,1679,4323,4 +1870,alias_default_433,call_function,alias.default,forward,15,1,1,2,1689,4322,4 +1871,alias_default_434,call_function,alias.default,forward,15,1,1,2,1689,4322,4 
+1872,alias_default_435,call_function,alias.default,forward,15,1,1,2,1680,4322,4 +1873,_scaled_dot_product_flash_attention_15,call_function,_scaled_dot_product_flash_attention.default,forward,15,3,3,4,1713,4321,2 +1874,getitem_135,call_function,getitem,forward,15,1,1,1,1714,4317,2 +1875,getitem_136,call_function,getitem,forward,15,1,1,1,1714,1714,2 +1876,getitem_141,call_function,getitem,forward,15,1,1,1,1714,1714,1 +1877,getitem_142,call_function,getitem,forward,15,1,1,1,1714,1714,1 +1878,alias_default_436,call_function,alias.default,forward,15,1,1,2,1715,4316,4 +1879,permute_171,call_function,permute.default,forward,15,1,1,1,1716,4315,4 +1880,view_359,call_function,view.default,forward,15,1,1,1,1717,4314,3 +1881,dtype_cast_140,call_function,dtype_cast.default,forward,15,1,1,1,1,4316,3 +1882,permute_172,call_function,permute.default,forward,15,1,1,1,2,4315,3 +1883,alias_default_437,call_function,alias.default,forward,15,1,1,2,1718,4313,4 +1884,alias_default_438,call_function,alias.default,forward,15,1,1,2,3,4314,3 +1885,einsum_default_108,call_function,einsum.default,forward,15,2,2,1,1723,4312,5 +1886,add_76,call_function,add.Tensor,forward,15,2,2,1,1724,4311,10 +1887,dtype_cast_141,call_function,dtype_cast.default,forward,15,1,1,1,1,4300,2 +1888,alias_default_439,call_function,alias.default,forward,15,1,1,3,1725,4310,4 +1889,convert_element_type_374,call_function,convert_element_type.default,forward,15,1,1,1,1726,4308,4 +1890,alias_default_441,call_function,alias.default,forward,15,1,1,2,1727,4307,4 +1891,pow_32,call_function,pow.Tensor_Scalar,forward,15,1,1,1,1728,4306,4 +1892,mean_31,call_function,mean.dim,forward,15,1,1,1,1729,4305,4 +1893,add_77,call_function,add.Scalar,forward,15,1,1,1,1730,4304,3 +1894,rsqrt_31,call_function,rsqrt.default,forward,15,1,1,1,1731,4303,3 +1895,alias_default_442,call_function,alias.default,forward,15,1,1,3,1732,4302,3 +1896,mul_109,call_function,mul.Tensor,forward,15,2,2,1,1733,4298,8 
+1897,alias_default_440,call_function,alias.default,forward,15,1,1,2,2,4299,2 +1898,mul_110,call_function,mul.Tensor,forward,15,2,2,1,1737,4297,8 +1899,convert_element_type_375,call_function,convert_element_type.default,forward,15,1,1,1,1738,4296,6 +1900,dtype_cast_142,call_function,dtype_cast.default,forward,15,1,1,1,1,4296,3 +1901,permute_173,call_function,permute.default,forward,15,1,1,1,2,4295,3 +1902,alias_default_443,call_function,alias.default,forward,15,1,1,4,1739,4295,4 +1903,alias_default_444,call_function,alias.default,forward,15,1,1,2,3,4294,3 +1904,einsum_default_109,call_function,einsum.default,forward,15,2,2,1,1744,4292,5 +1905,alias_default_445,call_function,alias.default,forward,15,1,1,2,1745,4291,4 +1906,convert_element_type_378,call_function,convert_element_type.default,forward,15,1,1,1,1746,4279,4 +1907,alias_default_446,call_function,alias.default,forward,15,1,1,2,1747,4278,4 +1908,neg_15,call_function,neg.default,forward,15,1,1,1,1748,4277,8 +1909,exp_15,call_function,exp.default,forward,15,1,1,1,1749,4276,6 +1910,add_78,call_function,add.Tensor,forward,15,1,1,1,1750,4275,4 +1911,div_15,call_function,div.Tensor,forward,15,2,2,1,1751,4274,6 +1912,convert_element_type_379,call_function,convert_element_type.default,forward,15,1,1,1,1752,4273,6 +1913,dtype_cast_143,call_function,dtype_cast.default,forward,15,1,1,1,1,4277,3 +1914,permute_174,call_function,permute.default,forward,15,1,1,1,2,4276,3 +1915,alias_default_448,call_function,alias.default,forward,15,1,1,2,3,4275,3 +1916,einsum_default_110,call_function,einsum.default,forward,15,2,2,1,1744,4273,5 +1917,alias_default_447,call_function,alias.default,forward,15,1,1,2,1753,4272,4 +1918,alias_default_449,call_function,alias.default,forward,15,1,1,2,1745,4272,4 +1919,mul_111,call_function,mul.Tensor,forward,15,2,2,1,1760,4271,8 +1920,dtype_cast_144,call_function,dtype_cast.default,forward,15,1,1,1,1,4273,3 +1921,permute_175,call_function,permute.default,forward,15,1,1,1,2,4272,3 
+1922,alias_default_450,call_function,alias.default,forward,15,1,1,2,1761,4270,4 +1923,alias_default_451,call_function,alias.default,forward,15,1,1,2,3,4271,3 +1924,einsum_default_111,call_function,einsum.default,forward,15,2,2,1,1766,4269,5 +1925,add_79,call_function,add.Tensor,forward,15,2,2,1,1767,4268,10 +1926,dtype_cast_145,call_function,dtype_cast.default,forward,16,1,1,1,1,4257,2 +1927,alias_default_452,call_function,alias.default,forward,15,1,1,3,1768,4267,4 +1928,convert_element_type_384,call_function,convert_element_type.default,forward,16,1,1,1,1769,4265,4 +1929,alias_default_454,call_function,alias.default,forward,16,1,1,2,1770,4264,4 +1930,pow_33,call_function,pow.Tensor_Scalar,forward,16,1,1,1,1771,4263,4 +1931,mean_32,call_function,mean.dim,forward,16,1,1,1,1772,4262,4 +1932,add_80,call_function,add.Scalar,forward,16,1,1,1,1773,4261,3 +1933,rsqrt_32,call_function,rsqrt.default,forward,16,1,1,1,1774,4260,3 +1934,alias_default_455,call_function,alias.default,forward,16,1,1,3,1775,4259,3 +1935,mul_112,call_function,mul.Tensor,forward,16,2,2,1,1776,4255,8 +1936,alias_default_453,call_function,alias.default,forward,16,1,1,2,2,4256,2 +1937,mul_113,call_function,mul.Tensor,forward,16,2,2,1,1780,4254,8 +1938,convert_element_type_385,call_function,convert_element_type.default,forward,16,1,1,1,1781,4253,6 +1939,dtype_cast_146,call_function,dtype_cast.default,forward,16,1,1,1,1,4240,3 +1940,permute_176,call_function,permute.default,forward,16,1,1,1,2,4239,3 +1941,alias_default_456,call_function,alias.default,forward,16,1,1,6,1782,4252,4 +1942,alias_default_457,call_function,alias.default,forward,16,1,1,2,3,4238,3 +1943,einsum_default_112,call_function,einsum.default,forward,16,2,2,1,1787,4236,5 +1944,dtype_cast_147,call_function,dtype_cast.default,forward,16,1,1,1,1,4240,3 +1945,permute_177,call_function,permute.default,forward,16,1,1,1,2,4239,3 +1946,alias_default_458,call_function,alias.default,forward,16,1,1,2,3,4238,3 
+1947,einsum_default_113,call_function,einsum.default,forward,16,2,2,1,1787,4236,5 +1948,dtype_cast_148,call_function,dtype_cast.default,forward,16,1,1,1,1,4233,3 +1949,permute_178,call_function,permute.default,forward,16,1,1,1,2,4232,3 +1950,alias_default_459,call_function,alias.default,forward,16,1,1,2,3,4231,3 +1951,einsum_default_114,call_function,einsum.default,forward,16,2,2,1,1787,4229,5 +1952,view_374,call_function,view.default,forward,16,1,1,1,1788,4235,4 +1953,view_375,call_function,view.default,forward,16,1,1,1,1788,4235,4 +1954,view_376,call_function,view.default,forward,16,1,1,1,1788,4228,4 +1955,convert_element_type_392,call_function,convert_element_type.default,forward,16,1,1,1,1789,4234,4 +1956,view_377,call_function,view.default,forward,16,1,1,1,1790,4233,4 +1957,view_as_complex_32,call_function,view_as_complex.default,forward,16,1,1,1,1791,4232,6 +1958,convert_element_type_393,call_function,convert_element_type.default,forward,16,1,1,1,1789,4234,4 +1959,view_378,call_function,view.default,forward,16,1,1,1,1790,4233,4 +1960,view_as_complex_33,call_function,view_as_complex.default,forward,16,1,1,1,1791,4232,6 +1961,view_379,call_function,view.default,forward,16,1,1,1,2,4243,3 +1962,alias_default_460,call_function,alias.default,forward,16,1,1,4,3,4242,3 +1963,mul_114,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8 +1964,view_as_real_32,call_function,view_as_real.default,forward,16,1,1,1,1795,4230,6 +1965,view_380,call_function,view.default,forward,16,1,1,1,1796,4229,6 +1966,mul_115,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8 +1967,view_as_real_33,call_function,view_as_real.default,forward,16,1,1,1,1795,4230,6 +1968,view_381,call_function,view.default,forward,16,1,1,1,1796,4229,6 +1969,convert_element_type_394,call_function,convert_element_type.default,forward,16,1,1,1,1797,4228,6 +1970,convert_element_type_395,call_function,convert_element_type.default,forward,16,1,1,1,1797,4228,6 
+1971,permute_179,call_function,permute.default,forward,16,1,1,1,1798,4227,6 +1972,permute_180,call_function,permute.default,forward,16,1,1,1,1798,4227,6 +1973,permute_181,call_function,permute.default,forward,16,1,1,1,1789,4227,4 +1974,alias_default_461,call_function,alias.default,forward,16,1,1,2,1799,4226,4 +1975,alias_default_462,call_function,alias.default,forward,16,1,1,2,1799,4226,4 +1976,alias_default_463,call_function,alias.default,forward,16,1,1,2,1790,4226,4 +1977,_scaled_dot_product_flash_attention_16,call_function,_scaled_dot_product_flash_attention.default,forward,16,3,3,4,1823,4225,2 +1978,getitem_144,call_function,getitem,forward,16,1,1,1,1824,4221,2 +1979,getitem_145,call_function,getitem,forward,16,1,1,1,1824,1824,2 +1980,getitem_150,call_function,getitem,forward,16,1,1,1,1824,1824,1 +1981,getitem_151,call_function,getitem,forward,16,1,1,1,1824,1824,1 +1982,alias_default_464,call_function,alias.default,forward,16,1,1,2,1825,4220,4 +1983,permute_182,call_function,permute.default,forward,16,1,1,1,1826,4219,4 +1984,view_382,call_function,view.default,forward,16,1,1,1,1827,4218,3 +1985,dtype_cast_149,call_function,dtype_cast.default,forward,16,1,1,1,1,4220,3 +1986,permute_183,call_function,permute.default,forward,16,1,1,1,2,4219,3 +1987,alias_default_465,call_function,alias.default,forward,16,1,1,2,1828,4217,4 +1988,alias_default_466,call_function,alias.default,forward,16,1,1,2,3,4218,3 +1989,einsum_default_115,call_function,einsum.default,forward,16,2,2,1,1833,4216,5 +1990,add_81,call_function,add.Tensor,forward,16,2,2,1,1834,4215,10 +1991,dtype_cast_150,call_function,dtype_cast.default,forward,16,1,1,1,1,4204,2 +1992,alias_default_467,call_function,alias.default,forward,16,1,1,3,1835,4214,4 +1993,convert_element_type_398,call_function,convert_element_type.default,forward,16,1,1,1,1836,4212,4 +1994,alias_default_469,call_function,alias.default,forward,16,1,1,2,1837,4211,4 +1995,pow_34,call_function,pow.Tensor_Scalar,forward,16,1,1,1,1838,4210,4 
+1996,mean_33,call_function,mean.dim,forward,16,1,1,1,1839,4209,4 +1997,add_82,call_function,add.Scalar,forward,16,1,1,1,1840,4208,3 +1998,rsqrt_33,call_function,rsqrt.default,forward,16,1,1,1,1841,4207,3 +1999,alias_default_470,call_function,alias.default,forward,16,1,1,3,1842,4206,3 +2000,mul_116,call_function,mul.Tensor,forward,16,2,2,1,1843,4202,8 +2001,alias_default_468,call_function,alias.default,forward,16,1,1,2,2,4203,2 +2002,mul_117,call_function,mul.Tensor,forward,16,2,2,1,1847,4201,8 +2003,convert_element_type_399,call_function,convert_element_type.default,forward,16,1,1,1,1848,4200,6 +2004,dtype_cast_151,call_function,dtype_cast.default,forward,16,1,1,1,1,4200,3 +2005,permute_184,call_function,permute.default,forward,16,1,1,1,2,4199,3 +2006,alias_default_471,call_function,alias.default,forward,16,1,1,4,1849,4199,4 +2007,alias_default_472,call_function,alias.default,forward,16,1,1,2,3,4198,3 +2008,einsum_default_116,call_function,einsum.default,forward,16,2,2,1,1854,4196,5 +2009,alias_default_473,call_function,alias.default,forward,16,1,1,2,1855,4195,4 +2010,convert_element_type_402,call_function,convert_element_type.default,forward,16,1,1,1,1856,4183,4 +2011,alias_default_474,call_function,alias.default,forward,16,1,1,2,1857,4182,4 +2012,neg_16,call_function,neg.default,forward,16,1,1,1,1858,4181,8 +2013,exp_16,call_function,exp.default,forward,16,1,1,1,1859,4180,6 +2014,add_83,call_function,add.Tensor,forward,16,1,1,1,1860,4179,4 +2015,div_16,call_function,div.Tensor,forward,16,2,2,1,1861,4178,6 +2016,convert_element_type_403,call_function,convert_element_type.default,forward,16,1,1,1,1862,4177,6 +2017,dtype_cast_152,call_function,dtype_cast.default,forward,16,1,1,1,1,4181,3 +2018,permute_185,call_function,permute.default,forward,16,1,1,1,2,4180,3 +2019,alias_default_476,call_function,alias.default,forward,16,1,1,2,3,4179,3 +2020,einsum_default_117,call_function,einsum.default,forward,16,2,2,1,1854,4177,5 
+2021,alias_default_475,call_function,alias.default,forward,16,1,1,2,1863,4176,4 +2022,alias_default_477,call_function,alias.default,forward,16,1,1,2,1855,4176,4 +2023,mul_118,call_function,mul.Tensor,forward,16,2,2,1,1870,4175,8 +2024,dtype_cast_153,call_function,dtype_cast.default,forward,16,1,1,1,1,4177,3 +2025,permute_186,call_function,permute.default,forward,16,1,1,1,2,4176,3 +2026,alias_default_478,call_function,alias.default,forward,16,1,1,2,1871,4174,4 +2027,alias_default_479,call_function,alias.default,forward,16,1,1,2,3,4175,3 +2028,einsum_default_118,call_function,einsum.default,forward,16,2,2,1,1876,4173,5 +2029,add_84,call_function,add.Tensor,forward,16,2,2,1,1877,4172,10 +2030,dtype_cast_154,call_function,dtype_cast.default,forward,17,1,1,1,1,4161,2 +2031,alias_default_480,call_function,alias.default,forward,16,1,1,3,1878,4171,4 +2032,convert_element_type_408,call_function,convert_element_type.default,forward,17,1,1,1,1879,4169,4 +2033,alias_default_482,call_function,alias.default,forward,17,1,1,2,1880,4168,4 +2034,pow_35,call_function,pow.Tensor_Scalar,forward,17,1,1,1,1881,4167,4 +2035,mean_34,call_function,mean.dim,forward,17,1,1,1,1882,4166,4 +2036,add_85,call_function,add.Scalar,forward,17,1,1,1,1883,4165,3 +2037,rsqrt_34,call_function,rsqrt.default,forward,17,1,1,1,1884,4164,3 +2038,alias_default_483,call_function,alias.default,forward,17,1,1,3,1885,4163,3 +2039,mul_119,call_function,mul.Tensor,forward,17,2,2,1,1886,4159,8 +2040,alias_default_481,call_function,alias.default,forward,17,1,1,2,2,4160,2 +2041,mul_120,call_function,mul.Tensor,forward,17,2,2,1,1890,4158,8 +2042,convert_element_type_409,call_function,convert_element_type.default,forward,17,1,1,1,1891,4157,6 +2043,dtype_cast_155,call_function,dtype_cast.default,forward,17,1,1,1,1,4144,3 +2044,permute_187,call_function,permute.default,forward,17,1,1,1,2,4143,3 +2045,alias_default_484,call_function,alias.default,forward,17,1,1,6,1892,4156,4 
+2046,alias_default_485,call_function,alias.default,forward,17,1,1,2,3,4142,3 +2047,einsum_default_119,call_function,einsum.default,forward,17,2,2,1,1897,4140,5 +2048,dtype_cast_156,call_function,dtype_cast.default,forward,17,1,1,1,1,4144,3 +2049,permute_188,call_function,permute.default,forward,17,1,1,1,2,4143,3 +2050,alias_default_486,call_function,alias.default,forward,17,1,1,2,3,4142,3 +2051,einsum_default_120,call_function,einsum.default,forward,17,2,2,1,1897,4140,5 +2052,dtype_cast_157,call_function,dtype_cast.default,forward,17,1,1,1,1,4137,3 +2053,permute_189,call_function,permute.default,forward,17,1,1,1,2,4136,3 +2054,alias_default_487,call_function,alias.default,forward,17,1,1,2,3,4135,3 +2055,einsum_default_121,call_function,einsum.default,forward,17,2,2,1,1897,4133,5 +2056,view_397,call_function,view.default,forward,17,1,1,1,1898,4139,4 +2057,view_398,call_function,view.default,forward,17,1,1,1,1898,4139,4 +2058,view_399,call_function,view.default,forward,17,1,1,1,1898,4132,4 +2059,convert_element_type_416,call_function,convert_element_type.default,forward,17,1,1,1,1899,4138,4 +2060,view_400,call_function,view.default,forward,17,1,1,1,1900,4137,4 +2061,view_as_complex_34,call_function,view_as_complex.default,forward,17,1,1,1,1901,4136,6 +2062,convert_element_type_417,call_function,convert_element_type.default,forward,17,1,1,1,1899,4138,4 +2063,view_401,call_function,view.default,forward,17,1,1,1,1900,4137,4 +2064,view_as_complex_35,call_function,view_as_complex.default,forward,17,1,1,1,1901,4136,6 +2065,view_402,call_function,view.default,forward,17,1,1,1,2,4147,3 +2066,alias_default_488,call_function,alias.default,forward,17,1,1,4,3,4146,3 +2067,mul_121,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8 +2068,view_as_real_34,call_function,view_as_real.default,forward,17,1,1,1,1905,4134,6 +2069,view_403,call_function,view.default,forward,17,1,1,1,1906,4133,6 +2070,mul_122,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8 
+2071,view_as_real_35,call_function,view_as_real.default,forward,17,1,1,1,1905,4134,6 +2072,view_404,call_function,view.default,forward,17,1,1,1,1906,4133,6 +2073,convert_element_type_418,call_function,convert_element_type.default,forward,17,1,1,1,1907,4132,6 +2074,convert_element_type_419,call_function,convert_element_type.default,forward,17,1,1,1,1907,4132,6 +2075,permute_190,call_function,permute.default,forward,17,1,1,1,1908,4131,6 +2076,permute_191,call_function,permute.default,forward,17,1,1,1,1908,4131,6 +2077,permute_192,call_function,permute.default,forward,17,1,1,1,1899,4131,4 +2078,alias_default_489,call_function,alias.default,forward,17,1,1,2,1909,4130,4 +2079,alias_default_490,call_function,alias.default,forward,17,1,1,2,1909,4130,4 +2080,alias_default_491,call_function,alias.default,forward,17,1,1,2,1900,4130,4 +2081,_scaled_dot_product_flash_attention_17,call_function,_scaled_dot_product_flash_attention.default,forward,17,3,3,4,1933,4129,2 +2082,getitem_153,call_function,getitem,forward,17,1,1,1,1934,4125,2 +2083,getitem_154,call_function,getitem,forward,17,1,1,1,1934,1934,2 +2084,getitem_159,call_function,getitem,forward,17,1,1,1,1934,1934,1 +2085,getitem_160,call_function,getitem,forward,17,1,1,1,1934,1934,1 +2086,alias_default_492,call_function,alias.default,forward,17,1,1,2,1935,4124,4 +2087,permute_193,call_function,permute.default,forward,17,1,1,1,1936,4123,4 +2088,view_405,call_function,view.default,forward,17,1,1,1,1937,4122,3 +2089,dtype_cast_158,call_function,dtype_cast.default,forward,17,1,1,1,1,4124,3 +2090,permute_194,call_function,permute.default,forward,17,1,1,1,2,4123,3 +2091,alias_default_493,call_function,alias.default,forward,17,1,1,2,1938,4121,4 +2092,alias_default_494,call_function,alias.default,forward,17,1,1,2,3,4122,3 +2093,einsum_default_122,call_function,einsum.default,forward,17,2,2,1,1943,4120,5 +2094,add_86,call_function,add.Tensor,forward,17,2,2,1,1944,4119,10 
+2095,dtype_cast_159,call_function,dtype_cast.default,forward,17,1,1,1,1,4108,2 +2096,alias_default_495,call_function,alias.default,forward,17,1,1,3,1945,4118,4 +2097,convert_element_type_422,call_function,convert_element_type.default,forward,17,1,1,1,1946,4116,4 +2098,alias_default_497,call_function,alias.default,forward,17,1,1,2,1947,4115,4 +2099,pow_36,call_function,pow.Tensor_Scalar,forward,17,1,1,1,1948,4114,4 +2100,mean_35,call_function,mean.dim,forward,17,1,1,1,1949,4113,4 +2101,add_87,call_function,add.Scalar,forward,17,1,1,1,1950,4112,3 +2102,rsqrt_35,call_function,rsqrt.default,forward,17,1,1,1,1951,4111,3 +2103,alias_default_498,call_function,alias.default,forward,17,1,1,3,1952,4110,3 +2104,mul_123,call_function,mul.Tensor,forward,17,2,2,1,1953,4106,8 +2105,alias_default_496,call_function,alias.default,forward,17,1,1,2,2,4107,2 +2106,mul_124,call_function,mul.Tensor,forward,17,2,2,1,1957,4105,8 +2107,convert_element_type_423,call_function,convert_element_type.default,forward,17,1,1,1,1958,4104,6 +2108,dtype_cast_160,call_function,dtype_cast.default,forward,17,1,1,1,1,4104,3 +2109,permute_195,call_function,permute.default,forward,17,1,1,1,2,4103,3 +2110,alias_default_499,call_function,alias.default,forward,17,1,1,4,1959,4103,4 +2111,alias_default_500,call_function,alias.default,forward,17,1,1,2,3,4102,3 +2112,einsum_default_123,call_function,einsum.default,forward,17,2,2,1,1964,4100,5 +2113,alias_default_501,call_function,alias.default,forward,17,1,1,2,1965,4099,4 +2114,convert_element_type_426,call_function,convert_element_type.default,forward,17,1,1,1,1966,4087,4 +2115,alias_default_502,call_function,alias.default,forward,17,1,1,2,1967,4086,4 +2116,neg_17,call_function,neg.default,forward,17,1,1,1,1968,4085,8 +2117,exp_17,call_function,exp.default,forward,17,1,1,1,1969,4084,6 +2118,add_88,call_function,add.Tensor,forward,17,1,1,1,1970,4083,4 +2119,div_17,call_function,div.Tensor,forward,17,2,2,1,1971,4082,6 
+2120,convert_element_type_427,call_function,convert_element_type.default,forward,17,1,1,1,1972,4081,6 +2121,dtype_cast_161,call_function,dtype_cast.default,forward,17,1,1,1,1,4085,3 +2122,permute_196,call_function,permute.default,forward,17,1,1,1,2,4084,3 +2123,alias_default_504,call_function,alias.default,forward,17,1,1,2,3,4083,3 +2124,einsum_default_124,call_function,einsum.default,forward,17,2,2,1,1964,4081,5 +2125,alias_default_503,call_function,alias.default,forward,17,1,1,2,1973,4080,4 +2126,alias_default_505,call_function,alias.default,forward,17,1,1,2,1965,4080,4 +2127,mul_125,call_function,mul.Tensor,forward,17,2,2,1,1980,4079,8 +2128,dtype_cast_162,call_function,dtype_cast.default,forward,17,1,1,1,1,4081,3 +2129,permute_197,call_function,permute.default,forward,17,1,1,1,2,4080,3 +2130,alias_default_506,call_function,alias.default,forward,17,1,1,2,1981,4078,4 +2131,alias_default_507,call_function,alias.default,forward,17,1,1,2,3,4079,3 +2132,einsum_default_125,call_function,einsum.default,forward,17,2,2,1,1986,4077,5 +2133,add_89,call_function,add.Tensor,forward,17,2,2,1,1987,4076,10 +2134,dtype_cast_163,call_function,dtype_cast.default,forward,18,1,1,1,1,4065,2 +2135,alias_default_508,call_function,alias.default,forward,17,1,1,3,1988,4075,4 +2136,convert_element_type_432,call_function,convert_element_type.default,forward,18,1,1,1,1989,4073,4 +2137,alias_default_510,call_function,alias.default,forward,18,1,1,2,1990,4072,4 +2138,pow_37,call_function,pow.Tensor_Scalar,forward,18,1,1,1,1991,4071,4 +2139,mean_36,call_function,mean.dim,forward,18,1,1,1,1992,4070,4 +2140,add_90,call_function,add.Scalar,forward,18,1,1,1,1993,4069,3 +2141,rsqrt_36,call_function,rsqrt.default,forward,18,1,1,1,1994,4068,3 +2142,alias_default_511,call_function,alias.default,forward,18,1,1,3,1995,4067,3 +2143,mul_126,call_function,mul.Tensor,forward,18,2,2,1,1996,4063,8 +2144,alias_default_509,call_function,alias.default,forward,18,1,1,2,2,4064,2 
+2145,mul_127,call_function,mul.Tensor,forward,18,2,2,1,2000,4062,8 +2146,convert_element_type_433,call_function,convert_element_type.default,forward,18,1,1,1,2001,4061,6 +2147,dtype_cast_164,call_function,dtype_cast.default,forward,18,1,1,1,1,4048,3 +2148,permute_198,call_function,permute.default,forward,18,1,1,1,2,4047,3 +2149,alias_default_512,call_function,alias.default,forward,18,1,1,6,2002,4060,4 +2150,alias_default_513,call_function,alias.default,forward,18,1,1,2,3,4046,3 +2151,einsum_default_126,call_function,einsum.default,forward,18,2,2,1,2007,4044,5 +2152,dtype_cast_165,call_function,dtype_cast.default,forward,18,1,1,1,1,4048,3 +2153,permute_199,call_function,permute.default,forward,18,1,1,1,2,4047,3 +2154,alias_default_514,call_function,alias.default,forward,18,1,1,2,3,4046,3 +2155,einsum_default_127,call_function,einsum.default,forward,18,2,2,1,2007,4044,5 +2156,dtype_cast_166,call_function,dtype_cast.default,forward,18,1,1,1,1,4041,3 +2157,permute_200,call_function,permute.default,forward,18,1,1,1,2,4040,3 +2158,alias_default_515,call_function,alias.default,forward,18,1,1,2,3,4039,3 +2159,einsum_default_128,call_function,einsum.default,forward,18,2,2,1,2007,4037,5 +2160,view_420,call_function,view.default,forward,18,1,1,1,2008,4043,4 +2161,view_421,call_function,view.default,forward,18,1,1,1,2008,4043,4 +2162,view_422,call_function,view.default,forward,18,1,1,1,2008,4036,4 +2163,convert_element_type_440,call_function,convert_element_type.default,forward,18,1,1,1,2009,4042,4 +2164,view_423,call_function,view.default,forward,18,1,1,1,2010,4041,4 +2165,view_as_complex_36,call_function,view_as_complex.default,forward,18,1,1,1,2011,4040,6 +2166,convert_element_type_441,call_function,convert_element_type.default,forward,18,1,1,1,2009,4042,4 +2167,view_424,call_function,view.default,forward,18,1,1,1,2010,4041,4 +2168,view_as_complex_37,call_function,view_as_complex.default,forward,18,1,1,1,2011,4040,6 
+2169,view_425,call_function,view.default,forward,18,1,1,1,2,4051,3 +2170,alias_default_516,call_function,alias.default,forward,18,1,1,4,3,4050,3 +2171,mul_128,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8 +2172,view_as_real_36,call_function,view_as_real.default,forward,18,1,1,1,2015,4038,6 +2173,view_426,call_function,view.default,forward,18,1,1,1,2016,4037,6 +2174,mul_129,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8 +2175,view_as_real_37,call_function,view_as_real.default,forward,18,1,1,1,2015,4038,6 +2176,view_427,call_function,view.default,forward,18,1,1,1,2016,4037,6 +2177,convert_element_type_442,call_function,convert_element_type.default,forward,18,1,1,1,2017,4036,6 +2178,convert_element_type_443,call_function,convert_element_type.default,forward,18,1,1,1,2017,4036,6 +2179,permute_201,call_function,permute.default,forward,18,1,1,1,2018,4035,6 +2180,permute_202,call_function,permute.default,forward,18,1,1,1,2018,4035,6 +2181,permute_203,call_function,permute.default,forward,18,1,1,1,2009,4035,4 +2182,alias_default_517,call_function,alias.default,forward,18,1,1,2,2019,4034,4 +2183,alias_default_518,call_function,alias.default,forward,18,1,1,2,2019,4034,4 +2184,alias_default_519,call_function,alias.default,forward,18,1,1,2,2010,4034,4 +2185,_scaled_dot_product_flash_attention_18,call_function,_scaled_dot_product_flash_attention.default,forward,18,3,3,4,2043,4033,2 +2186,getitem_162,call_function,getitem,forward,18,1,1,1,2044,4029,2 +2187,getitem_163,call_function,getitem,forward,18,1,1,1,2044,2044,2 +2188,getitem_168,call_function,getitem,forward,18,1,1,1,2044,2044,1 +2189,getitem_169,call_function,getitem,forward,18,1,1,1,2044,2044,1 +2190,alias_default_520,call_function,alias.default,forward,18,1,1,2,2045,4028,4 +2191,permute_204,call_function,permute.default,forward,18,1,1,1,2046,4027,4 +2192,view_428,call_function,view.default,forward,18,1,1,1,2047,4026,3 +2193,dtype_cast_167,call_function,dtype_cast.default,forward,18,1,1,1,1,4028,3 
+2194,permute_205,call_function,permute.default,forward,18,1,1,1,2,4027,3 +2195,alias_default_521,call_function,alias.default,forward,18,1,1,2,2048,4025,4 +2196,alias_default_522,call_function,alias.default,forward,18,1,1,2,3,4026,3 +2197,einsum_default_129,call_function,einsum.default,forward,18,2,2,1,2053,4024,5 +2198,add_91,call_function,add.Tensor,forward,18,2,2,1,2054,4023,10 +2199,dtype_cast_168,call_function,dtype_cast.default,forward,18,1,1,1,1,4012,2 +2200,alias_default_523,call_function,alias.default,forward,18,1,1,3,2055,4022,4 +2201,convert_element_type_446,call_function,convert_element_type.default,forward,18,1,1,1,2056,4020,4 +2202,alias_default_525,call_function,alias.default,forward,18,1,1,2,2057,4019,4 +2203,pow_38,call_function,pow.Tensor_Scalar,forward,18,1,1,1,2058,4018,4 +2204,mean_37,call_function,mean.dim,forward,18,1,1,1,2059,4017,4 +2205,add_92,call_function,add.Scalar,forward,18,1,1,1,2060,4016,3 +2206,rsqrt_37,call_function,rsqrt.default,forward,18,1,1,1,2061,4015,3 +2207,alias_default_526,call_function,alias.default,forward,18,1,1,3,2062,4014,3 +2208,mul_130,call_function,mul.Tensor,forward,18,2,2,1,2063,4010,8 +2209,alias_default_524,call_function,alias.default,forward,18,1,1,2,2,4011,2 +2210,mul_131,call_function,mul.Tensor,forward,18,2,2,1,2067,4009,8 +2211,convert_element_type_447,call_function,convert_element_type.default,forward,18,1,1,1,2068,4008,6 +2212,dtype_cast_169,call_function,dtype_cast.default,forward,18,1,1,1,1,4008,3 +2213,permute_206,call_function,permute.default,forward,18,1,1,1,2,4007,3 +2214,alias_default_527,call_function,alias.default,forward,18,1,1,4,2069,4007,4 +2215,alias_default_528,call_function,alias.default,forward,18,1,1,2,3,4006,3 +2216,einsum_default_130,call_function,einsum.default,forward,18,2,2,1,2074,4004,5 +2217,alias_default_529,call_function,alias.default,forward,18,1,1,2,2075,4003,4 +2218,convert_element_type_450,call_function,convert_element_type.default,forward,18,1,1,1,2076,3991,4 
+2219,alias_default_530,call_function,alias.default,forward,18,1,1,2,2077,3990,4 +2220,neg_18,call_function,neg.default,forward,18,1,1,1,2078,3989,8 +2221,exp_18,call_function,exp.default,forward,18,1,1,1,2079,3988,6 +2222,add_93,call_function,add.Tensor,forward,18,1,1,1,2080,3987,4 +2223,div_18,call_function,div.Tensor,forward,18,2,2,1,2081,3986,6 +2224,convert_element_type_451,call_function,convert_element_type.default,forward,18,1,1,1,2082,3985,6 +2225,dtype_cast_170,call_function,dtype_cast.default,forward,18,1,1,1,1,3989,3 +2226,permute_207,call_function,permute.default,forward,18,1,1,1,2,3988,3 +2227,alias_default_532,call_function,alias.default,forward,18,1,1,2,3,3987,3 +2228,einsum_default_131,call_function,einsum.default,forward,18,2,2,1,2074,3985,5 +2229,alias_default_531,call_function,alias.default,forward,18,1,1,2,2083,3984,4 +2230,alias_default_533,call_function,alias.default,forward,18,1,1,2,2075,3984,4 +2231,mul_132,call_function,mul.Tensor,forward,18,2,2,1,2090,3983,8 +2232,dtype_cast_171,call_function,dtype_cast.default,forward,18,1,1,1,1,3985,3 +2233,permute_208,call_function,permute.default,forward,18,1,1,1,2,3984,3 +2234,alias_default_534,call_function,alias.default,forward,18,1,1,2,2091,3982,4 +2235,alias_default_535,call_function,alias.default,forward,18,1,1,2,3,3983,3 +2236,einsum_default_132,call_function,einsum.default,forward,18,2,2,1,2096,3981,5 +2237,add_94,call_function,add.Tensor,forward,18,2,2,1,2097,3980,10 +2238,dtype_cast_172,call_function,dtype_cast.default,forward,19,1,1,1,1,3969,2 +2239,alias_default_536,call_function,alias.default,forward,18,1,1,3,2098,3979,4 +2240,convert_element_type_456,call_function,convert_element_type.default,forward,19,1,1,1,2099,3977,4 +2241,alias_default_538,call_function,alias.default,forward,19,1,1,2,2100,3976,4 +2242,pow_39,call_function,pow.Tensor_Scalar,forward,19,1,1,1,2101,3975,4 +2243,mean_38,call_function,mean.dim,forward,19,1,1,1,2102,3974,4 
+2244,add_95,call_function,add.Scalar,forward,19,1,1,1,2103,3973,3 +2245,rsqrt_38,call_function,rsqrt.default,forward,19,1,1,1,2104,3972,3 +2246,alias_default_539,call_function,alias.default,forward,19,1,1,3,2105,3971,3 +2247,mul_133,call_function,mul.Tensor,forward,19,2,2,1,2106,3967,8 +2248,alias_default_537,call_function,alias.default,forward,19,1,1,2,2,3968,2 +2249,mul_134,call_function,mul.Tensor,forward,19,2,2,1,2110,3966,8 +2250,convert_element_type_457,call_function,convert_element_type.default,forward,19,1,1,1,2111,3965,6 +2251,dtype_cast_173,call_function,dtype_cast.default,forward,19,1,1,1,1,3952,3 +2252,permute_209,call_function,permute.default,forward,19,1,1,1,2,3951,3 +2253,alias_default_540,call_function,alias.default,forward,19,1,1,6,2112,3964,4 +2254,alias_default_541,call_function,alias.default,forward,19,1,1,2,3,3950,3 +2255,einsum_default_133,call_function,einsum.default,forward,19,2,2,1,2117,3948,5 +2256,dtype_cast_174,call_function,dtype_cast.default,forward,19,1,1,1,1,3952,3 +2257,permute_210,call_function,permute.default,forward,19,1,1,1,2,3951,3 +2258,alias_default_542,call_function,alias.default,forward,19,1,1,2,3,3950,3 +2259,einsum_default_134,call_function,einsum.default,forward,19,2,2,1,2117,3948,5 +2260,dtype_cast_175,call_function,dtype_cast.default,forward,19,1,1,1,1,3945,3 +2261,permute_211,call_function,permute.default,forward,19,1,1,1,2,3944,3 +2262,alias_default_543,call_function,alias.default,forward,19,1,1,2,3,3943,3 +2263,einsum_default_135,call_function,einsum.default,forward,19,2,2,1,2117,3941,5 +2264,view_443,call_function,view.default,forward,19,1,1,1,2118,3947,4 +2265,view_444,call_function,view.default,forward,19,1,1,1,2118,3947,4 +2266,view_445,call_function,view.default,forward,19,1,1,1,2118,3940,4 +2267,convert_element_type_464,call_function,convert_element_type.default,forward,19,1,1,1,2119,3946,4 +2268,view_446,call_function,view.default,forward,19,1,1,1,2120,3945,4 
+2269,view_as_complex_38,call_function,view_as_complex.default,forward,19,1,1,1,2121,3944,6 +2270,convert_element_type_465,call_function,convert_element_type.default,forward,19,1,1,1,2119,3946,4 +2271,view_447,call_function,view.default,forward,19,1,1,1,2120,3945,4 +2272,view_as_complex_39,call_function,view_as_complex.default,forward,19,1,1,1,2121,3944,6 +2273,view_448,call_function,view.default,forward,19,1,1,1,2,3955,3 +2274,alias_default_544,call_function,alias.default,forward,19,1,1,4,3,3954,3 +2275,mul_135,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8 +2276,view_as_real_38,call_function,view_as_real.default,forward,19,1,1,1,2125,3942,6 +2277,view_449,call_function,view.default,forward,19,1,1,1,2126,3941,6 +2278,mul_136,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8 +2279,view_as_real_39,call_function,view_as_real.default,forward,19,1,1,1,2125,3942,6 +2280,view_450,call_function,view.default,forward,19,1,1,1,2126,3941,6 +2281,convert_element_type_466,call_function,convert_element_type.default,forward,19,1,1,1,2127,3940,6 +2282,convert_element_type_467,call_function,convert_element_type.default,forward,19,1,1,1,2127,3940,6 +2283,permute_212,call_function,permute.default,forward,19,1,1,1,2128,3939,6 +2284,permute_213,call_function,permute.default,forward,19,1,1,1,2128,3939,6 +2285,permute_214,call_function,permute.default,forward,19,1,1,1,2119,3939,4 +2286,alias_default_545,call_function,alias.default,forward,19,1,1,2,2129,3938,4 +2287,alias_default_546,call_function,alias.default,forward,19,1,1,2,2129,3938,4 +2288,alias_default_547,call_function,alias.default,forward,19,1,1,2,2120,3938,4 +2289,_scaled_dot_product_flash_attention_19,call_function,_scaled_dot_product_flash_attention.default,forward,19,3,3,4,2153,3937,2 +2290,getitem_171,call_function,getitem,forward,19,1,1,1,2154,3933,2 +2291,getitem_172,call_function,getitem,forward,19,1,1,1,2154,2154,2 +2292,getitem_177,call_function,getitem,forward,19,1,1,1,2154,2154,1 
+2293,getitem_178,call_function,getitem,forward,19,1,1,1,2154,2154,1 +2294,alias_default_548,call_function,alias.default,forward,19,1,1,2,2155,3932,4 +2295,permute_215,call_function,permute.default,forward,19,1,1,1,2156,3931,4 +2296,view_451,call_function,view.default,forward,19,1,1,1,2157,3930,3 +2297,dtype_cast_176,call_function,dtype_cast.default,forward,19,1,1,1,1,3932,3 +2298,permute_216,call_function,permute.default,forward,19,1,1,1,2,3931,3 +2299,alias_default_549,call_function,alias.default,forward,19,1,1,2,2158,3929,4 +2300,alias_default_550,call_function,alias.default,forward,19,1,1,2,3,3930,3 +2301,einsum_default_136,call_function,einsum.default,forward,19,2,2,1,2163,3928,5 +2302,add_96,call_function,add.Tensor,forward,19,2,2,1,2164,3927,10 +2303,dtype_cast_177,call_function,dtype_cast.default,forward,19,1,1,1,1,3916,2 +2304,alias_default_551,call_function,alias.default,forward,19,1,1,3,2165,3926,4 +2305,convert_element_type_470,call_function,convert_element_type.default,forward,19,1,1,1,2166,3924,4 +2306,alias_default_553,call_function,alias.default,forward,19,1,1,2,2167,3923,4 +2307,pow_40,call_function,pow.Tensor_Scalar,forward,19,1,1,1,2168,3922,4 +2308,mean_39,call_function,mean.dim,forward,19,1,1,1,2169,3921,4 +2309,add_97,call_function,add.Scalar,forward,19,1,1,1,2170,3920,3 +2310,rsqrt_39,call_function,rsqrt.default,forward,19,1,1,1,2171,3919,3 +2311,alias_default_554,call_function,alias.default,forward,19,1,1,3,2172,3918,3 +2312,mul_137,call_function,mul.Tensor,forward,19,2,2,1,2173,3914,8 +2313,alias_default_552,call_function,alias.default,forward,19,1,1,2,2,3915,2 +2314,mul_138,call_function,mul.Tensor,forward,19,2,2,1,2177,3913,8 +2315,convert_element_type_471,call_function,convert_element_type.default,forward,19,1,1,1,2178,3912,6 +2316,dtype_cast_178,call_function,dtype_cast.default,forward,19,1,1,1,1,3912,3 +2317,permute_217,call_function,permute.default,forward,19,1,1,1,2,3911,3 
+2318,alias_default_555,call_function,alias.default,forward,19,1,1,4,2179,3911,4 +2319,alias_default_556,call_function,alias.default,forward,19,1,1,2,3,3910,3 +2320,einsum_default_137,call_function,einsum.default,forward,19,2,2,1,2184,3908,5 +2321,alias_default_557,call_function,alias.default,forward,19,1,1,2,2185,3907,4 +2322,convert_element_type_474,call_function,convert_element_type.default,forward,19,1,1,1,2186,3895,4 +2323,alias_default_558,call_function,alias.default,forward,19,1,1,2,2187,3894,4 +2324,neg_19,call_function,neg.default,forward,19,1,1,1,2188,3893,8 +2325,exp_19,call_function,exp.default,forward,19,1,1,1,2189,3892,6 +2326,add_98,call_function,add.Tensor,forward,19,1,1,1,2190,3891,4 +2327,div_19,call_function,div.Tensor,forward,19,2,2,1,2191,3890,6 +2328,convert_element_type_475,call_function,convert_element_type.default,forward,19,1,1,1,2192,3889,6 +2329,dtype_cast_179,call_function,dtype_cast.default,forward,19,1,1,1,1,3893,3 +2330,permute_218,call_function,permute.default,forward,19,1,1,1,2,3892,3 +2331,alias_default_560,call_function,alias.default,forward,19,1,1,2,3,3891,3 +2332,einsum_default_138,call_function,einsum.default,forward,19,2,2,1,2184,3889,5 +2333,alias_default_559,call_function,alias.default,forward,19,1,1,2,2193,3888,4 +2334,alias_default_561,call_function,alias.default,forward,19,1,1,2,2185,3888,4 +2335,mul_139,call_function,mul.Tensor,forward,19,2,2,1,2200,3887,8 +2336,dtype_cast_180,call_function,dtype_cast.default,forward,19,1,1,1,1,3889,3 +2337,permute_219,call_function,permute.default,forward,19,1,1,1,2,3888,3 +2338,alias_default_562,call_function,alias.default,forward,19,1,1,2,2201,3886,4 +2339,alias_default_563,call_function,alias.default,forward,19,1,1,2,3,3887,3 +2340,einsum_default_139,call_function,einsum.default,forward,19,2,2,1,2206,3885,5 +2341,add_99,call_function,add.Tensor,forward,19,2,2,1,2207,3884,10 +2342,dtype_cast_181,call_function,dtype_cast.default,forward,20,1,1,1,1,3873,2 
+2343,alias_default_564,call_function,alias.default,forward,19,1,1,3,2208,3883,4 +2344,convert_element_type_480,call_function,convert_element_type.default,forward,20,1,1,1,2209,3881,4 +2345,alias_default_566,call_function,alias.default,forward,20,1,1,2,2210,3880,4 +2346,pow_41,call_function,pow.Tensor_Scalar,forward,20,1,1,1,2211,3879,4 +2347,mean_40,call_function,mean.dim,forward,20,1,1,1,2212,3878,4 +2348,add_100,call_function,add.Scalar,forward,20,1,1,1,2213,3877,3 +2349,rsqrt_40,call_function,rsqrt.default,forward,20,1,1,1,2214,3876,3 +2350,alias_default_567,call_function,alias.default,forward,20,1,1,3,2215,3875,3 +2351,mul_140,call_function,mul.Tensor,forward,20,2,2,1,2216,3871,8 +2352,alias_default_565,call_function,alias.default,forward,20,1,1,2,2,3872,2 +2353,mul_141,call_function,mul.Tensor,forward,20,2,2,1,2220,3870,8 +2354,convert_element_type_481,call_function,convert_element_type.default,forward,20,1,1,1,2221,3869,6 +2355,dtype_cast_182,call_function,dtype_cast.default,forward,20,1,1,1,1,3856,3 +2356,permute_220,call_function,permute.default,forward,20,1,1,1,2,3855,3 +2357,alias_default_568,call_function,alias.default,forward,20,1,1,6,2222,3868,4 +2358,alias_default_569,call_function,alias.default,forward,20,1,1,2,3,3854,3 +2359,einsum_default_140,call_function,einsum.default,forward,20,2,2,1,2227,3852,5 +2360,dtype_cast_183,call_function,dtype_cast.default,forward,20,1,1,1,1,3856,3 +2361,permute_221,call_function,permute.default,forward,20,1,1,1,2,3855,3 +2362,alias_default_570,call_function,alias.default,forward,20,1,1,2,3,3854,3 +2363,einsum_default_141,call_function,einsum.default,forward,20,2,2,1,2227,3852,5 +2364,dtype_cast_184,call_function,dtype_cast.default,forward,20,1,1,1,1,3849,3 +2365,permute_222,call_function,permute.default,forward,20,1,1,1,2,3848,3 +2366,alias_default_571,call_function,alias.default,forward,20,1,1,2,3,3847,3 +2367,einsum_default_142,call_function,einsum.default,forward,20,2,2,1,2227,3845,5 
+2368,view_466,call_function,view.default,forward,20,1,1,1,2228,3851,4 +2369,view_467,call_function,view.default,forward,20,1,1,1,2228,3851,4 +2370,view_468,call_function,view.default,forward,20,1,1,1,2228,3844,4 +2371,convert_element_type_488,call_function,convert_element_type.default,forward,20,1,1,1,2229,3850,4 +2372,view_469,call_function,view.default,forward,20,1,1,1,2230,3849,4 +2373,view_as_complex_40,call_function,view_as_complex.default,forward,20,1,1,1,2231,3848,6 +2374,convert_element_type_489,call_function,convert_element_type.default,forward,20,1,1,1,2229,3850,4 +2375,view_470,call_function,view.default,forward,20,1,1,1,2230,3849,4 +2376,view_as_complex_41,call_function,view_as_complex.default,forward,20,1,1,1,2231,3848,6 +2377,view_471,call_function,view.default,forward,20,1,1,1,2,3859,3 +2378,alias_default_572,call_function,alias.default,forward,20,1,1,4,3,3858,3 +2379,mul_142,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8 +2380,view_as_real_40,call_function,view_as_real.default,forward,20,1,1,1,2235,3846,6 +2381,view_472,call_function,view.default,forward,20,1,1,1,2236,3845,6 +2382,mul_143,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8 +2383,view_as_real_41,call_function,view_as_real.default,forward,20,1,1,1,2235,3846,6 +2384,view_473,call_function,view.default,forward,20,1,1,1,2236,3845,6 +2385,convert_element_type_490,call_function,convert_element_type.default,forward,20,1,1,1,2237,3844,6 +2386,convert_element_type_491,call_function,convert_element_type.default,forward,20,1,1,1,2237,3844,6 +2387,permute_223,call_function,permute.default,forward,20,1,1,1,2238,3843,6 +2388,permute_224,call_function,permute.default,forward,20,1,1,1,2238,3843,6 +2389,permute_225,call_function,permute.default,forward,20,1,1,1,2229,3843,4 +2390,alias_default_573,call_function,alias.default,forward,20,1,1,2,2239,3842,4 +2391,alias_default_574,call_function,alias.default,forward,20,1,1,2,2239,3842,4 
+2392,alias_default_575,call_function,alias.default,forward,20,1,1,2,2230,3842,4 +2393,_scaled_dot_product_flash_attention_20,call_function,_scaled_dot_product_flash_attention.default,forward,20,3,3,4,2263,3841,2 +2394,getitem_180,call_function,getitem,forward,20,1,1,1,2264,3837,2 +2395,getitem_181,call_function,getitem,forward,20,1,1,1,2264,2264,2 +2396,getitem_186,call_function,getitem,forward,20,1,1,1,2264,2264,1 +2397,getitem_187,call_function,getitem,forward,20,1,1,1,2264,2264,1 +2398,alias_default_576,call_function,alias.default,forward,20,1,1,2,2265,3836,4 +2399,permute_226,call_function,permute.default,forward,20,1,1,1,2266,3835,4 +2400,view_474,call_function,view.default,forward,20,1,1,1,2267,3834,3 +2401,dtype_cast_185,call_function,dtype_cast.default,forward,20,1,1,1,1,3836,3 +2402,permute_227,call_function,permute.default,forward,20,1,1,1,2,3835,3 +2403,alias_default_577,call_function,alias.default,forward,20,1,1,2,2268,3833,4 +2404,alias_default_578,call_function,alias.default,forward,20,1,1,2,3,3834,3 +2405,einsum_default_143,call_function,einsum.default,forward,20,2,2,1,2273,3832,5 +2406,add_101,call_function,add.Tensor,forward,20,2,2,1,2274,3831,10 +2407,dtype_cast_186,call_function,dtype_cast.default,forward,20,1,1,1,1,3820,2 +2408,alias_default_579,call_function,alias.default,forward,20,1,1,3,2275,3830,4 +2409,convert_element_type_494,call_function,convert_element_type.default,forward,20,1,1,1,2276,3828,4 +2410,alias_default_581,call_function,alias.default,forward,20,1,1,2,2277,3827,4 +2411,pow_42,call_function,pow.Tensor_Scalar,forward,20,1,1,1,2278,3826,4 +2412,mean_41,call_function,mean.dim,forward,20,1,1,1,2279,3825,4 +2413,add_102,call_function,add.Scalar,forward,20,1,1,1,2280,3824,3 +2414,rsqrt_41,call_function,rsqrt.default,forward,20,1,1,1,2281,3823,3 +2415,alias_default_582,call_function,alias.default,forward,20,1,1,3,2282,3822,3 +2416,mul_144,call_function,mul.Tensor,forward,20,2,2,1,2283,3818,8 
+2417,alias_default_580,call_function,alias.default,forward,20,1,1,2,2,3819,2 +2418,mul_145,call_function,mul.Tensor,forward,20,2,2,1,2287,3817,8 +2419,convert_element_type_495,call_function,convert_element_type.default,forward,20,1,1,1,2288,3816,6 +2420,dtype_cast_187,call_function,dtype_cast.default,forward,20,1,1,1,1,3816,3 +2421,permute_228,call_function,permute.default,forward,20,1,1,1,2,3815,3 +2422,alias_default_583,call_function,alias.default,forward,20,1,1,4,2289,3815,4 +2423,alias_default_584,call_function,alias.default,forward,20,1,1,2,3,3814,3 +2424,einsum_default_144,call_function,einsum.default,forward,20,2,2,1,2294,3812,5 +2425,alias_default_585,call_function,alias.default,forward,20,1,1,2,2295,3811,4 +2426,convert_element_type_498,call_function,convert_element_type.default,forward,20,1,1,1,2296,3799,4 +2427,alias_default_586,call_function,alias.default,forward,20,1,1,2,2297,3798,4 +2428,neg_20,call_function,neg.default,forward,20,1,1,1,2298,3797,8 +2429,exp_20,call_function,exp.default,forward,20,1,1,1,2299,3796,6 +2430,add_103,call_function,add.Tensor,forward,20,1,1,1,2300,3795,4 +2431,div_20,call_function,div.Tensor,forward,20,2,2,1,2301,3794,6 +2432,convert_element_type_499,call_function,convert_element_type.default,forward,20,1,1,1,2302,3793,6 +2433,dtype_cast_188,call_function,dtype_cast.default,forward,20,1,1,1,1,3797,3 +2434,permute_229,call_function,permute.default,forward,20,1,1,1,2,3796,3 +2435,alias_default_588,call_function,alias.default,forward,20,1,1,2,3,3795,3 +2436,einsum_default_145,call_function,einsum.default,forward,20,2,2,1,2294,3793,5 +2437,alias_default_587,call_function,alias.default,forward,20,1,1,2,2303,3792,4 +2438,alias_default_589,call_function,alias.default,forward,20,1,1,2,2295,3792,4 +2439,mul_146,call_function,mul.Tensor,forward,20,2,2,1,2310,3791,8 +2440,dtype_cast_189,call_function,dtype_cast.default,forward,20,1,1,1,1,3793,3 +2441,permute_230,call_function,permute.default,forward,20,1,1,1,2,3792,3 
+2442,alias_default_590,call_function,alias.default,forward,20,1,1,2,2311,3790,4 +2443,alias_default_591,call_function,alias.default,forward,20,1,1,2,3,3791,3 +2444,einsum_default_146,call_function,einsum.default,forward,20,2,2,1,2316,3789,5 +2445,add_104,call_function,add.Tensor,forward,20,2,2,1,2317,3788,10 +2446,dtype_cast_190,call_function,dtype_cast.default,forward,21,1,1,1,1,3777,2 +2447,alias_default_592,call_function,alias.default,forward,20,1,1,3,2318,3787,4 +2448,convert_element_type_504,call_function,convert_element_type.default,forward,21,1,1,1,2319,3785,4 +2449,alias_default_594,call_function,alias.default,forward,21,1,1,2,2320,3784,4 +2450,pow_43,call_function,pow.Tensor_Scalar,forward,21,1,1,1,2321,3783,4 +2451,mean_42,call_function,mean.dim,forward,21,1,1,1,2322,3782,4 +2452,add_105,call_function,add.Scalar,forward,21,1,1,1,2323,3781,3 +2453,rsqrt_42,call_function,rsqrt.default,forward,21,1,1,1,2324,3780,3 +2454,alias_default_595,call_function,alias.default,forward,21,1,1,3,2325,3779,3 +2455,mul_147,call_function,mul.Tensor,forward,21,2,2,1,2326,3775,8 +2456,alias_default_593,call_function,alias.default,forward,21,1,1,2,2,3776,2 +2457,mul_148,call_function,mul.Tensor,forward,21,2,2,1,2330,3774,8 +2458,convert_element_type_505,call_function,convert_element_type.default,forward,21,1,1,1,2331,3773,6 +2459,dtype_cast_191,call_function,dtype_cast.default,forward,21,1,1,1,1,3760,3 +2460,permute_231,call_function,permute.default,forward,21,1,1,1,2,3759,3 +2461,alias_default_596,call_function,alias.default,forward,21,1,1,6,2332,3772,4 +2462,alias_default_597,call_function,alias.default,forward,21,1,1,2,3,3758,3 +2463,einsum_default_147,call_function,einsum.default,forward,21,2,2,1,2337,3756,5 +2464,dtype_cast_192,call_function,dtype_cast.default,forward,21,1,1,1,1,3760,3 +2465,permute_232,call_function,permute.default,forward,21,1,1,1,2,3759,3 +2466,alias_default_598,call_function,alias.default,forward,21,1,1,2,3,3758,3 
+2467,einsum_default_148,call_function,einsum.default,forward,21,2,2,1,2337,3756,5 +2468,dtype_cast_193,call_function,dtype_cast.default,forward,21,1,1,1,1,3753,3 +2469,permute_233,call_function,permute.default,forward,21,1,1,1,2,3752,3 +2470,alias_default_599,call_function,alias.default,forward,21,1,1,2,3,3751,3 +2471,einsum_default_149,call_function,einsum.default,forward,21,2,2,1,2337,3749,5 +2472,view_489,call_function,view.default,forward,21,1,1,1,2338,3755,4 +2473,view_490,call_function,view.default,forward,21,1,1,1,2338,3755,4 +2474,view_491,call_function,view.default,forward,21,1,1,1,2338,3748,4 +2475,convert_element_type_512,call_function,convert_element_type.default,forward,21,1,1,1,2339,3754,4 +2476,view_492,call_function,view.default,forward,21,1,1,1,2340,3753,4 +2477,view_as_complex_42,call_function,view_as_complex.default,forward,21,1,1,1,2341,3752,6 +2478,convert_element_type_513,call_function,convert_element_type.default,forward,21,1,1,1,2339,3754,4 +2479,view_493,call_function,view.default,forward,21,1,1,1,2340,3753,4 +2480,view_as_complex_43,call_function,view_as_complex.default,forward,21,1,1,1,2341,3752,6 +2481,view_494,call_function,view.default,forward,21,1,1,1,2,3763,3 +2482,alias_default_600,call_function,alias.default,forward,21,1,1,4,3,3762,3 +2483,mul_149,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8 +2484,view_as_real_42,call_function,view_as_real.default,forward,21,1,1,1,2345,3750,6 +2485,view_495,call_function,view.default,forward,21,1,1,1,2346,3749,6 +2486,mul_150,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8 +2487,view_as_real_43,call_function,view_as_real.default,forward,21,1,1,1,2345,3750,6 +2488,view_496,call_function,view.default,forward,21,1,1,1,2346,3749,6 +2489,convert_element_type_514,call_function,convert_element_type.default,forward,21,1,1,1,2347,3748,6 +2490,convert_element_type_515,call_function,convert_element_type.default,forward,21,1,1,1,2347,3748,6 
+2491,permute_234,call_function,permute.default,forward,21,1,1,1,2348,3747,6 +2492,permute_235,call_function,permute.default,forward,21,1,1,1,2348,3747,6 +2493,permute_236,call_function,permute.default,forward,21,1,1,1,2339,3747,4 +2494,alias_default_601,call_function,alias.default,forward,21,1,1,2,2349,3746,4 +2495,alias_default_602,call_function,alias.default,forward,21,1,1,2,2349,3746,4 +2496,alias_default_603,call_function,alias.default,forward,21,1,1,2,2340,3746,4 +2497,_scaled_dot_product_flash_attention_21,call_function,_scaled_dot_product_flash_attention.default,forward,21,3,3,4,2373,3745,2 +2498,getitem_189,call_function,getitem,forward,21,1,1,1,2374,3741,2 +2499,getitem_190,call_function,getitem,forward,21,1,1,1,2374,2374,2 +2500,getitem_195,call_function,getitem,forward,21,1,1,1,2374,2374,1 +2501,getitem_196,call_function,getitem,forward,21,1,1,1,2374,2374,1 +2502,alias_default_604,call_function,alias.default,forward,21,1,1,2,2375,3740,4 +2503,permute_237,call_function,permute.default,forward,21,1,1,1,2376,3739,4 +2504,view_497,call_function,view.default,forward,21,1,1,1,2377,3738,3 +2505,dtype_cast_194,call_function,dtype_cast.default,forward,21,1,1,1,1,3740,3 +2506,permute_238,call_function,permute.default,forward,21,1,1,1,2,3739,3 +2507,alias_default_605,call_function,alias.default,forward,21,1,1,2,2378,3737,4 +2508,alias_default_606,call_function,alias.default,forward,21,1,1,2,3,3738,3 +2509,einsum_default_150,call_function,einsum.default,forward,21,2,2,1,2383,3736,5 +2510,add_106,call_function,add.Tensor,forward,21,2,2,1,2384,3735,10 +2511,dtype_cast_195,call_function,dtype_cast.default,forward,21,1,1,1,1,3724,2 +2512,alias_default_607,call_function,alias.default,forward,21,1,1,3,2385,3734,4 +2513,convert_element_type_518,call_function,convert_element_type.default,forward,21,1,1,1,2386,3732,4 +2514,alias_default_609,call_function,alias.default,forward,21,1,1,2,2387,3731,4 +2515,pow_44,call_function,pow.Tensor_Scalar,forward,21,1,1,1,2388,3730,4 
+2516,mean_43,call_function,mean.dim,forward,21,1,1,1,2389,3729,4 +2517,add_107,call_function,add.Scalar,forward,21,1,1,1,2390,3728,3 +2518,rsqrt_43,call_function,rsqrt.default,forward,21,1,1,1,2391,3727,3 +2519,alias_default_610,call_function,alias.default,forward,21,1,1,3,2392,3726,3 +2520,mul_151,call_function,mul.Tensor,forward,21,2,2,1,2393,3722,8 +2521,alias_default_608,call_function,alias.default,forward,21,1,1,2,2,3723,2 +2522,mul_152,call_function,mul.Tensor,forward,21,2,2,1,2397,3721,8 +2523,convert_element_type_519,call_function,convert_element_type.default,forward,21,1,1,1,2398,3720,6 +2524,dtype_cast_196,call_function,dtype_cast.default,forward,21,1,1,1,1,3720,3 +2525,permute_239,call_function,permute.default,forward,21,1,1,1,2,3719,3 +2526,alias_default_611,call_function,alias.default,forward,21,1,1,4,2399,3719,4 +2527,alias_default_612,call_function,alias.default,forward,21,1,1,2,3,3718,3 +2528,einsum_default_151,call_function,einsum.default,forward,21,2,2,1,2404,3716,5 +2529,alias_default_613,call_function,alias.default,forward,21,1,1,2,2405,3715,4 +2530,convert_element_type_522,call_function,convert_element_type.default,forward,21,1,1,1,2406,3703,4 +2531,alias_default_614,call_function,alias.default,forward,21,1,1,2,2407,3702,4 +2532,neg_21,call_function,neg.default,forward,21,1,1,1,2408,3701,8 +2533,exp_21,call_function,exp.default,forward,21,1,1,1,2409,3700,6 +2534,add_108,call_function,add.Tensor,forward,21,1,1,1,2410,3699,4 +2535,div_21,call_function,div.Tensor,forward,21,2,2,1,2411,3698,6 +2536,convert_element_type_523,call_function,convert_element_type.default,forward,21,1,1,1,2412,3697,6 +2537,dtype_cast_197,call_function,dtype_cast.default,forward,21,1,1,1,1,3701,3 +2538,permute_240,call_function,permute.default,forward,21,1,1,1,2,3700,3 +2539,alias_default_616,call_function,alias.default,forward,21,1,1,2,3,3699,3 +2540,einsum_default_152,call_function,einsum.default,forward,21,2,2,1,2404,3697,5 
+2541,alias_default_615,call_function,alias.default,forward,21,1,1,2,2413,3696,4 +2542,alias_default_617,call_function,alias.default,forward,21,1,1,2,2405,3696,4 +2543,mul_153,call_function,mul.Tensor,forward,21,2,2,1,2420,3695,8 +2544,dtype_cast_198,call_function,dtype_cast.default,forward,21,1,1,1,1,3697,3 +2545,permute_241,call_function,permute.default,forward,21,1,1,1,2,3696,3 +2546,alias_default_618,call_function,alias.default,forward,21,1,1,2,2421,3694,4 +2547,alias_default_619,call_function,alias.default,forward,21,1,1,2,3,3695,3 +2548,einsum_default_153,call_function,einsum.default,forward,21,2,2,1,2426,3693,5 +2549,add_109,call_function,add.Tensor,forward,21,2,2,1,2427,3692,10 +2550,dtype_cast_199,call_function,dtype_cast.default,forward,22,1,1,1,1,3681,2 +2551,alias_default_620,call_function,alias.default,forward,21,1,1,3,2428,3691,4 +2552,convert_element_type_528,call_function,convert_element_type.default,forward,22,1,1,1,2429,3689,4 +2553,alias_default_622,call_function,alias.default,forward,22,1,1,2,2430,3688,4 +2554,pow_45,call_function,pow.Tensor_Scalar,forward,22,1,1,1,2431,3687,4 +2555,mean_44,call_function,mean.dim,forward,22,1,1,1,2432,3686,4 +2556,add_110,call_function,add.Scalar,forward,22,1,1,1,2433,3685,3 +2557,rsqrt_44,call_function,rsqrt.default,forward,22,1,1,1,2434,3684,3 +2558,alias_default_623,call_function,alias.default,forward,22,1,1,3,2435,3683,3 +2559,mul_154,call_function,mul.Tensor,forward,22,2,2,1,2436,3679,8 +2560,alias_default_621,call_function,alias.default,forward,22,1,1,2,2,3680,2 +2561,mul_155,call_function,mul.Tensor,forward,22,2,2,1,2440,3678,8 +2562,convert_element_type_529,call_function,convert_element_type.default,forward,22,1,1,1,2441,3677,6 +2563,dtype_cast_200,call_function,dtype_cast.default,forward,22,1,1,1,1,3664,3 +2564,permute_242,call_function,permute.default,forward,22,1,1,1,2,3663,3 +2565,alias_default_624,call_function,alias.default,forward,22,1,1,6,2442,3676,4 
+2566,alias_default_625,call_function,alias.default,forward,22,1,1,2,3,3662,3 +2567,einsum_default_154,call_function,einsum.default,forward,22,2,2,1,2447,3660,5 +2568,dtype_cast_201,call_function,dtype_cast.default,forward,22,1,1,1,1,3664,3 +2569,permute_243,call_function,permute.default,forward,22,1,1,1,2,3663,3 +2570,alias_default_626,call_function,alias.default,forward,22,1,1,2,3,3662,3 +2571,einsum_default_155,call_function,einsum.default,forward,22,2,2,1,2447,3660,5 +2572,dtype_cast_202,call_function,dtype_cast.default,forward,22,1,1,1,1,3657,3 +2573,permute_244,call_function,permute.default,forward,22,1,1,1,2,3656,3 +2574,alias_default_627,call_function,alias.default,forward,22,1,1,2,3,3655,3 +2575,einsum_default_156,call_function,einsum.default,forward,22,2,2,1,2447,3653,5 +2576,view_512,call_function,view.default,forward,22,1,1,1,2448,3659,4 +2577,view_513,call_function,view.default,forward,22,1,1,1,2448,3659,4 +2578,view_514,call_function,view.default,forward,22,1,1,1,2448,3652,4 +2579,convert_element_type_536,call_function,convert_element_type.default,forward,22,1,1,1,2449,3658,4 +2580,view_515,call_function,view.default,forward,22,1,1,1,2450,3657,4 +2581,view_as_complex_44,call_function,view_as_complex.default,forward,22,1,1,1,2451,3656,6 +2582,convert_element_type_537,call_function,convert_element_type.default,forward,22,1,1,1,2449,3658,4 +2583,view_516,call_function,view.default,forward,22,1,1,1,2450,3657,4 +2584,view_as_complex_45,call_function,view_as_complex.default,forward,22,1,1,1,2451,3656,6 +2585,view_517,call_function,view.default,forward,22,1,1,1,2,3667,3 +2586,alias_default_628,call_function,alias.default,forward,22,1,1,4,3,3666,3 +2587,mul_156,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8 +2588,view_as_real_44,call_function,view_as_real.default,forward,22,1,1,1,2455,3654,6 +2589,view_518,call_function,view.default,forward,22,1,1,1,2456,3653,6 +2590,mul_157,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8 
+2591,view_as_real_45,call_function,view_as_real.default,forward,22,1,1,1,2455,3654,6 +2592,view_519,call_function,view.default,forward,22,1,1,1,2456,3653,6 +2593,convert_element_type_538,call_function,convert_element_type.default,forward,22,1,1,1,2457,3652,6 +2594,convert_element_type_539,call_function,convert_element_type.default,forward,22,1,1,1,2457,3652,6 +2595,permute_245,call_function,permute.default,forward,22,1,1,1,2458,3651,6 +2596,permute_246,call_function,permute.default,forward,22,1,1,1,2458,3651,6 +2597,permute_247,call_function,permute.default,forward,22,1,1,1,2449,3651,4 +2598,alias_default_629,call_function,alias.default,forward,22,1,1,2,2459,3650,4 +2599,alias_default_630,call_function,alias.default,forward,22,1,1,2,2459,3650,4 +2600,alias_default_631,call_function,alias.default,forward,22,1,1,2,2450,3650,4 +2601,_scaled_dot_product_flash_attention_22,call_function,_scaled_dot_product_flash_attention.default,forward,22,3,3,4,2483,3649,2 +2602,getitem_198,call_function,getitem,forward,22,1,1,1,2484,3645,2 +2603,getitem_199,call_function,getitem,forward,22,1,1,1,2484,2484,2 +2604,getitem_204,call_function,getitem,forward,22,1,1,1,2484,2484,1 +2605,getitem_205,call_function,getitem,forward,22,1,1,1,2484,2484,1 +2606,alias_default_632,call_function,alias.default,forward,22,1,1,2,2485,3644,4 +2607,permute_248,call_function,permute.default,forward,22,1,1,1,2486,3643,4 +2608,view_520,call_function,view.default,forward,22,1,1,1,2487,3642,3 +2609,dtype_cast_203,call_function,dtype_cast.default,forward,22,1,1,1,1,3644,3 +2610,permute_249,call_function,permute.default,forward,22,1,1,1,2,3643,3 +2611,alias_default_633,call_function,alias.default,forward,22,1,1,2,2488,3641,4 +2612,alias_default_634,call_function,alias.default,forward,22,1,1,2,3,3642,3 +2613,einsum_default_157,call_function,einsum.default,forward,22,2,2,1,2493,3640,5 +2614,add_111,call_function,add.Tensor,forward,22,2,2,1,2494,3639,10 
+2615,dtype_cast_204,call_function,dtype_cast.default,forward,22,1,1,1,1,3628,2 +2616,alias_default_635,call_function,alias.default,forward,22,1,1,3,2495,3638,4 +2617,convert_element_type_542,call_function,convert_element_type.default,forward,22,1,1,1,2496,3636,4 +2618,alias_default_637,call_function,alias.default,forward,22,1,1,2,2497,3635,4 +2619,pow_46,call_function,pow.Tensor_Scalar,forward,22,1,1,1,2498,3634,4 +2620,mean_45,call_function,mean.dim,forward,22,1,1,1,2499,3633,4 +2621,add_112,call_function,add.Scalar,forward,22,1,1,1,2500,3632,3 +2622,rsqrt_45,call_function,rsqrt.default,forward,22,1,1,1,2501,3631,3 +2623,alias_default_638,call_function,alias.default,forward,22,1,1,3,2502,3630,3 +2624,mul_158,call_function,mul.Tensor,forward,22,2,2,1,2503,3626,8 +2625,alias_default_636,call_function,alias.default,forward,22,1,1,2,2,3627,2 +2626,mul_159,call_function,mul.Tensor,forward,22,2,2,1,2507,3625,8 +2627,convert_element_type_543,call_function,convert_element_type.default,forward,22,1,1,1,2508,3624,6 +2628,dtype_cast_205,call_function,dtype_cast.default,forward,22,1,1,1,1,3624,3 +2629,permute_250,call_function,permute.default,forward,22,1,1,1,2,3623,3 +2630,alias_default_639,call_function,alias.default,forward,22,1,1,4,2509,3623,4 +2631,alias_default_640,call_function,alias.default,forward,22,1,1,2,3,3622,3 +2632,einsum_default_158,call_function,einsum.default,forward,22,2,2,1,2514,3620,5 +2633,alias_default_641,call_function,alias.default,forward,22,1,1,2,2515,3619,4 +2634,convert_element_type_546,call_function,convert_element_type.default,forward,22,1,1,1,2516,3607,4 +2635,alias_default_642,call_function,alias.default,forward,22,1,1,2,2517,3606,4 +2636,neg_22,call_function,neg.default,forward,22,1,1,1,2518,3605,8 +2637,exp_22,call_function,exp.default,forward,22,1,1,1,2519,3604,6 +2638,add_113,call_function,add.Tensor,forward,22,1,1,1,2520,3603,4 +2639,div_22,call_function,div.Tensor,forward,22,2,2,1,2521,3602,6 
+2640,convert_element_type_547,call_function,convert_element_type.default,forward,22,1,1,1,2522,3601,6 +2641,dtype_cast_206,call_function,dtype_cast.default,forward,22,1,1,1,1,3605,3 +2642,permute_251,call_function,permute.default,forward,22,1,1,1,2,3604,3 +2643,alias_default_644,call_function,alias.default,forward,22,1,1,2,3,3603,3 +2644,einsum_default_159,call_function,einsum.default,forward,22,2,2,1,2514,3601,5 +2645,alias_default_643,call_function,alias.default,forward,22,1,1,2,2523,3600,4 +2646,alias_default_645,call_function,alias.default,forward,22,1,1,2,2515,3600,4 +2647,mul_160,call_function,mul.Tensor,forward,22,2,2,1,2530,3599,8 +2648,dtype_cast_207,call_function,dtype_cast.default,forward,22,1,1,1,1,3601,3 +2649,permute_252,call_function,permute.default,forward,22,1,1,1,2,3600,3 +2650,alias_default_646,call_function,alias.default,forward,22,1,1,2,2531,3598,4 +2651,alias_default_647,call_function,alias.default,forward,22,1,1,2,3,3599,3 +2652,einsum_default_160,call_function,einsum.default,forward,22,2,2,1,2536,3597,5 +2653,add_114,call_function,add.Tensor,forward,22,2,2,1,2537,3596,10 +2654,dtype_cast_208,call_function,dtype_cast.default,forward,23,1,1,1,1,3585,2 +2655,alias_default_648,call_function,alias.default,forward,22,1,1,3,2538,3595,4 +2656,convert_element_type_552,call_function,convert_element_type.default,forward,23,1,1,1,2539,3593,4 +2657,alias_default_650,call_function,alias.default,forward,23,1,1,2,2540,3592,4 +2658,pow_47,call_function,pow.Tensor_Scalar,forward,23,1,1,1,2541,3591,4 +2659,mean_46,call_function,mean.dim,forward,23,1,1,1,2542,3590,4 +2660,add_115,call_function,add.Scalar,forward,23,1,1,1,2543,3589,3 +2661,rsqrt_46,call_function,rsqrt.default,forward,23,1,1,1,2544,3588,3 +2662,alias_default_651,call_function,alias.default,forward,23,1,1,3,2545,3587,3 +2663,mul_161,call_function,mul.Tensor,forward,23,2,2,1,2546,3583,8 +2664,alias_default_649,call_function,alias.default,forward,23,1,1,2,2,3584,2 
+2665,mul_162,call_function,mul.Tensor,forward,23,2,2,1,2550,3582,8 +2666,convert_element_type_553,call_function,convert_element_type.default,forward,23,1,1,1,2551,3581,6 +2667,dtype_cast_209,call_function,dtype_cast.default,forward,23,1,1,1,1,3568,3 +2668,permute_253,call_function,permute.default,forward,23,1,1,1,2,3567,3 +2669,alias_default_652,call_function,alias.default,forward,23,1,1,6,2552,3580,4 +2670,alias_default_653,call_function,alias.default,forward,23,1,1,2,3,3566,3 +2671,einsum_default_161,call_function,einsum.default,forward,23,2,2,1,2557,3564,5 +2672,dtype_cast_210,call_function,dtype_cast.default,forward,23,1,1,1,1,3568,3 +2673,permute_254,call_function,permute.default,forward,23,1,1,1,2,3567,3 +2674,alias_default_654,call_function,alias.default,forward,23,1,1,2,3,3566,3 +2675,einsum_default_162,call_function,einsum.default,forward,23,2,2,1,2557,3564,5 +2676,dtype_cast_211,call_function,dtype_cast.default,forward,23,1,1,1,1,3561,3 +2677,permute_255,call_function,permute.default,forward,23,1,1,1,2,3560,3 +2678,alias_default_655,call_function,alias.default,forward,23,1,1,2,3,3559,3 +2679,einsum_default_163,call_function,einsum.default,forward,23,2,2,1,2557,3557,5 +2680,view_535,call_function,view.default,forward,23,1,1,1,2558,3563,4 +2681,view_536,call_function,view.default,forward,23,1,1,1,2558,3563,4 +2682,view_537,call_function,view.default,forward,23,1,1,1,2558,3556,4 +2683,convert_element_type_560,call_function,convert_element_type.default,forward,23,1,1,1,2559,3562,4 +2684,view_538,call_function,view.default,forward,23,1,1,1,2560,3561,4 +2685,view_as_complex_46,call_function,view_as_complex.default,forward,23,1,1,1,2561,3560,6 +2686,convert_element_type_561,call_function,convert_element_type.default,forward,23,1,1,1,2559,3562,4 +2687,view_539,call_function,view.default,forward,23,1,1,1,2560,3561,4 +2688,view_as_complex_47,call_function,view_as_complex.default,forward,23,1,1,1,2561,3560,6 
+2689,view_540,call_function,view.default,forward,23,1,1,1,2,3571,3 +2690,alias_default_656,call_function,alias.default,forward,23,1,1,4,3,3570,3 +2691,mul_163,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8 +2692,view_as_real_46,call_function,view_as_real.default,forward,23,1,1,1,2565,3558,6 +2693,view_541,call_function,view.default,forward,23,1,1,1,2566,3557,6 +2694,mul_164,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8 +2695,view_as_real_47,call_function,view_as_real.default,forward,23,1,1,1,2565,3558,6 +2696,view_542,call_function,view.default,forward,23,1,1,1,2566,3557,6 +2697,convert_element_type_562,call_function,convert_element_type.default,forward,23,1,1,1,2567,3556,6 +2698,convert_element_type_563,call_function,convert_element_type.default,forward,23,1,1,1,2567,3556,6 +2699,permute_256,call_function,permute.default,forward,23,1,1,1,2568,3555,6 +2700,permute_257,call_function,permute.default,forward,23,1,1,1,2568,3555,6 +2701,permute_258,call_function,permute.default,forward,23,1,1,1,2559,3555,4 +2702,alias_default_657,call_function,alias.default,forward,23,1,1,2,2569,3554,4 +2703,alias_default_658,call_function,alias.default,forward,23,1,1,2,2569,3554,4 +2704,alias_default_659,call_function,alias.default,forward,23,1,1,2,2560,3554,4 +2705,_scaled_dot_product_flash_attention_23,call_function,_scaled_dot_product_flash_attention.default,forward,23,3,3,4,2593,3553,2 +2706,getitem_207,call_function,getitem,forward,23,1,1,1,2594,3549,2 +2707,getitem_208,call_function,getitem,forward,23,1,1,1,2594,2594,2 +2708,getitem_213,call_function,getitem,forward,23,1,1,1,2594,2594,1 +2709,getitem_214,call_function,getitem,forward,23,1,1,1,2594,2594,1 +2710,alias_default_660,call_function,alias.default,forward,23,1,1,2,2595,3548,4 +2711,permute_259,call_function,permute.default,forward,23,1,1,1,2596,3547,4 +2712,view_543,call_function,view.default,forward,23,1,1,1,2597,3546,3 +2713,dtype_cast_212,call_function,dtype_cast.default,forward,23,1,1,1,1,3548,3 
+2714,permute_260,call_function,permute.default,forward,23,1,1,1,2,3547,3 +2715,alias_default_661,call_function,alias.default,forward,23,1,1,2,2598,3545,4 +2716,alias_default_662,call_function,alias.default,forward,23,1,1,2,3,3546,3 +2717,einsum_default_164,call_function,einsum.default,forward,23,2,2,1,2603,3544,5 +2718,add_116,call_function,add.Tensor,forward,23,2,2,1,2604,3543,10 +2719,dtype_cast_213,call_function,dtype_cast.default,forward,23,1,1,1,1,3532,2 +2720,alias_default_663,call_function,alias.default,forward,23,1,1,3,2605,3542,4 +2721,convert_element_type_566,call_function,convert_element_type.default,forward,23,1,1,1,2606,3540,4 +2722,alias_default_665,call_function,alias.default,forward,23,1,1,2,2607,3539,4 +2723,pow_48,call_function,pow.Tensor_Scalar,forward,23,1,1,1,2608,3538,4 +2724,mean_47,call_function,mean.dim,forward,23,1,1,1,2609,3537,4 +2725,add_117,call_function,add.Scalar,forward,23,1,1,1,2610,3536,3 +2726,rsqrt_47,call_function,rsqrt.default,forward,23,1,1,1,2611,3535,3 +2727,alias_default_666,call_function,alias.default,forward,23,1,1,3,2612,3534,3 +2728,mul_165,call_function,mul.Tensor,forward,23,2,2,1,2613,3530,8 +2729,alias_default_664,call_function,alias.default,forward,23,1,1,2,2,3531,2 +2730,mul_166,call_function,mul.Tensor,forward,23,2,2,1,2617,3529,8 +2731,convert_element_type_567,call_function,convert_element_type.default,forward,23,1,1,1,2618,3528,6 +2732,dtype_cast_214,call_function,dtype_cast.default,forward,23,1,1,1,1,3528,3 +2733,permute_261,call_function,permute.default,forward,23,1,1,1,2,3527,3 +2734,alias_default_667,call_function,alias.default,forward,23,1,1,4,2619,3527,4 +2735,alias_default_668,call_function,alias.default,forward,23,1,1,2,3,3526,3 +2736,einsum_default_165,call_function,einsum.default,forward,23,2,2,1,2624,3524,5 +2737,alias_default_669,call_function,alias.default,forward,23,1,1,2,2625,3523,4 +2738,convert_element_type_570,call_function,convert_element_type.default,forward,23,1,1,1,2626,3511,4 
+2739,alias_default_670,call_function,alias.default,forward,23,1,1,2,2627,3510,4 +2740,neg_23,call_function,neg.default,forward,23,1,1,1,2628,3509,8 +2741,exp_23,call_function,exp.default,forward,23,1,1,1,2629,3508,6 +2742,add_118,call_function,add.Tensor,forward,23,1,1,1,2630,3507,4 +2743,div_23,call_function,div.Tensor,forward,23,2,2,1,2631,3506,6 +2744,convert_element_type_571,call_function,convert_element_type.default,forward,23,1,1,1,2632,3505,6 +2745,dtype_cast_215,call_function,dtype_cast.default,forward,23,1,1,1,1,3509,3 +2746,permute_262,call_function,permute.default,forward,23,1,1,1,2,3508,3 +2747,alias_default_672,call_function,alias.default,forward,23,1,1,2,3,3507,3 +2748,einsum_default_166,call_function,einsum.default,forward,23,2,2,1,2624,3505,5 +2749,alias_default_671,call_function,alias.default,forward,23,1,1,2,2633,3504,4 +2750,alias_default_673,call_function,alias.default,forward,23,1,1,2,2625,3504,4 +2751,mul_167,call_function,mul.Tensor,forward,23,2,2,1,2640,3503,8 +2752,dtype_cast_216,call_function,dtype_cast.default,forward,23,1,1,1,1,3505,3 +2753,permute_263,call_function,permute.default,forward,23,1,1,1,2,3504,3 +2754,alias_default_674,call_function,alias.default,forward,23,1,1,2,2641,3502,4 +2755,alias_default_675,call_function,alias.default,forward,23,1,1,2,3,3503,3 +2756,einsum_default_167,call_function,einsum.default,forward,23,2,2,1,2646,3501,5 +2757,add_119,call_function,add.Tensor,forward,23,2,2,1,2647,3500,10 +2758,dtype_cast_217,call_function,dtype_cast.default,forward,24,1,1,1,1,3489,2 +2759,alias_default_676,call_function,alias.default,forward,23,1,1,3,2648,3499,4 +2760,convert_element_type_576,call_function,convert_element_type.default,forward,24,1,1,1,2649,3497,4 +2761,alias_default_678,call_function,alias.default,forward,24,1,1,2,2650,3496,4 +2762,pow_49,call_function,pow.Tensor_Scalar,forward,24,1,1,1,2651,3495,4 +2763,mean_48,call_function,mean.dim,forward,24,1,1,1,2652,3494,4 
+2764,add_120,call_function,add.Scalar,forward,24,1,1,1,2653,3493,3 +2765,rsqrt_48,call_function,rsqrt.default,forward,24,1,1,1,2654,3492,3 +2766,alias_default_679,call_function,alias.default,forward,24,1,1,3,2655,3491,3 +2767,mul_168,call_function,mul.Tensor,forward,24,2,2,1,2656,3487,8 +2768,alias_default_677,call_function,alias.default,forward,24,1,1,2,2,3488,2 +2769,mul_169,call_function,mul.Tensor,forward,24,2,2,1,2660,3486,8 +2770,convert_element_type_577,call_function,convert_element_type.default,forward,24,1,1,1,2661,3485,6 +2771,dtype_cast_218,call_function,dtype_cast.default,forward,24,1,1,1,1,3472,3 +2772,permute_264,call_function,permute.default,forward,24,1,1,1,2,3471,3 +2773,alias_default_680,call_function,alias.default,forward,24,1,1,6,2662,3484,4 +2774,alias_default_681,call_function,alias.default,forward,24,1,1,2,3,3470,3 +2775,einsum_default_168,call_function,einsum.default,forward,24,2,2,1,2667,3468,5 +2776,dtype_cast_219,call_function,dtype_cast.default,forward,24,1,1,1,1,3472,3 +2777,permute_265,call_function,permute.default,forward,24,1,1,1,2,3471,3 +2778,alias_default_682,call_function,alias.default,forward,24,1,1,2,3,3470,3 +2779,einsum_default_169,call_function,einsum.default,forward,24,2,2,1,2667,3468,5 +2780,dtype_cast_220,call_function,dtype_cast.default,forward,24,1,1,1,1,3465,3 +2781,permute_266,call_function,permute.default,forward,24,1,1,1,2,3464,3 +2782,alias_default_683,call_function,alias.default,forward,24,1,1,2,3,3463,3 +2783,einsum_default_170,call_function,einsum.default,forward,24,2,2,1,2667,3461,5 +2784,view_558,call_function,view.default,forward,24,1,1,1,2668,3467,4 +2785,view_559,call_function,view.default,forward,24,1,1,1,2668,3467,4 +2786,view_560,call_function,view.default,forward,24,1,1,1,2668,3460,4 +2787,convert_element_type_584,call_function,convert_element_type.default,forward,24,1,1,1,2669,3466,4 +2788,view_561,call_function,view.default,forward,24,1,1,1,2670,3465,4 
+2789,view_as_complex_48,call_function,view_as_complex.default,forward,24,1,1,1,2671,3464,6 +2790,convert_element_type_585,call_function,convert_element_type.default,forward,24,1,1,1,2669,3466,4 +2791,view_562,call_function,view.default,forward,24,1,1,1,2670,3465,4 +2792,view_as_complex_49,call_function,view_as_complex.default,forward,24,1,1,1,2671,3464,6 +2793,view_563,call_function,view.default,forward,24,1,1,1,2,3475,3 +2794,alias_default_684,call_function,alias.default,forward,24,1,1,4,3,3474,3 +2795,mul_170,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8 +2796,view_as_real_48,call_function,view_as_real.default,forward,24,1,1,1,2675,3462,6 +2797,view_564,call_function,view.default,forward,24,1,1,1,2676,3461,6 +2798,mul_171,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8 +2799,view_as_real_49,call_function,view_as_real.default,forward,24,1,1,1,2675,3462,6 +2800,view_565,call_function,view.default,forward,24,1,1,1,2676,3461,6 +2801,convert_element_type_586,call_function,convert_element_type.default,forward,24,1,1,1,2677,3460,6 +2802,convert_element_type_587,call_function,convert_element_type.default,forward,24,1,1,1,2677,3460,6 +2803,permute_267,call_function,permute.default,forward,24,1,1,1,2678,3459,6 +2804,permute_268,call_function,permute.default,forward,24,1,1,1,2678,3459,6 +2805,permute_269,call_function,permute.default,forward,24,1,1,1,2669,3459,4 +2806,alias_default_685,call_function,alias.default,forward,24,1,1,2,2679,3458,4 +2807,alias_default_686,call_function,alias.default,forward,24,1,1,2,2679,3458,4 +2808,alias_default_687,call_function,alias.default,forward,24,1,1,2,2670,3458,4 +2809,_scaled_dot_product_flash_attention_24,call_function,_scaled_dot_product_flash_attention.default,forward,24,3,3,4,2703,3457,2 +2810,getitem_216,call_function,getitem,forward,24,1,1,1,2704,3453,2 +2811,getitem_217,call_function,getitem,forward,24,1,1,1,2704,2704,2 +2812,getitem_222,call_function,getitem,forward,24,1,1,1,2704,2704,1 
+2813,getitem_223,call_function,getitem,forward,24,1,1,1,2704,2704,1 +2814,alias_default_688,call_function,alias.default,forward,24,1,1,2,2705,3452,4 +2815,permute_270,call_function,permute.default,forward,24,1,1,1,2706,3451,4 +2816,view_566,call_function,view.default,forward,24,1,1,1,2707,3450,3 +2817,dtype_cast_221,call_function,dtype_cast.default,forward,24,1,1,1,1,3452,3 +2818,permute_271,call_function,permute.default,forward,24,1,1,1,2,3451,3 +2819,alias_default_689,call_function,alias.default,forward,24,1,1,2,2708,3449,4 +2820,alias_default_690,call_function,alias.default,forward,24,1,1,2,3,3450,3 +2821,einsum_default_171,call_function,einsum.default,forward,24,2,2,1,2713,3448,5 +2822,add_121,call_function,add.Tensor,forward,24,2,2,1,2714,3447,10 +2823,dtype_cast_222,call_function,dtype_cast.default,forward,24,1,1,1,1,3436,2 +2824,alias_default_691,call_function,alias.default,forward,24,1,1,3,2715,3446,4 +2825,convert_element_type_590,call_function,convert_element_type.default,forward,24,1,1,1,2716,3444,4 +2826,alias_default_693,call_function,alias.default,forward,24,1,1,2,2717,3443,4 +2827,pow_50,call_function,pow.Tensor_Scalar,forward,24,1,1,1,2718,3442,4 +2828,mean_49,call_function,mean.dim,forward,24,1,1,1,2719,3441,4 +2829,add_122,call_function,add.Scalar,forward,24,1,1,1,2720,3440,3 +2830,rsqrt_49,call_function,rsqrt.default,forward,24,1,1,1,2721,3439,3 +2831,alias_default_694,call_function,alias.default,forward,24,1,1,3,2722,3438,3 +2832,mul_172,call_function,mul.Tensor,forward,24,2,2,1,2723,3434,8 +2833,alias_default_692,call_function,alias.default,forward,24,1,1,2,2,3435,2 +2834,mul_173,call_function,mul.Tensor,forward,24,2,2,1,2727,3433,8 +2835,convert_element_type_591,call_function,convert_element_type.default,forward,24,1,1,1,2728,3432,6 +2836,dtype_cast_223,call_function,dtype_cast.default,forward,24,1,1,1,1,3432,3 +2837,permute_272,call_function,permute.default,forward,24,1,1,1,2,3431,3 
+2838,alias_default_695,call_function,alias.default,forward,24,1,1,4,2729,3431,4 +2839,alias_default_696,call_function,alias.default,forward,24,1,1,2,3,3430,3 +2840,einsum_default_172,call_function,einsum.default,forward,24,2,2,1,2734,3428,5 +2841,alias_default_697,call_function,alias.default,forward,24,1,1,2,2735,3427,4 +2842,convert_element_type_594,call_function,convert_element_type.default,forward,24,1,1,1,2736,3415,4 +2843,alias_default_698,call_function,alias.default,forward,24,1,1,2,2737,3414,4 +2844,neg_24,call_function,neg.default,forward,24,1,1,1,2738,3413,8 +2845,exp_24,call_function,exp.default,forward,24,1,1,1,2739,3412,6 +2846,add_123,call_function,add.Tensor,forward,24,1,1,1,2740,3411,4 +2847,div_24,call_function,div.Tensor,forward,24,2,2,1,2741,3410,6 +2848,convert_element_type_595,call_function,convert_element_type.default,forward,24,1,1,1,2742,3409,6 +2849,dtype_cast_224,call_function,dtype_cast.default,forward,24,1,1,1,1,3413,3 +2850,permute_273,call_function,permute.default,forward,24,1,1,1,2,3412,3 +2851,alias_default_700,call_function,alias.default,forward,24,1,1,2,3,3411,3 +2852,einsum_default_173,call_function,einsum.default,forward,24,2,2,1,2734,3409,5 +2853,alias_default_699,call_function,alias.default,forward,24,1,1,2,2743,3408,4 +2854,alias_default_701,call_function,alias.default,forward,24,1,1,2,2735,3408,4 +2855,mul_174,call_function,mul.Tensor,forward,24,2,2,1,2750,3407,8 +2856,dtype_cast_225,call_function,dtype_cast.default,forward,24,1,1,1,1,3409,3 +2857,permute_274,call_function,permute.default,forward,24,1,1,1,2,3408,3 +2858,alias_default_702,call_function,alias.default,forward,24,1,1,2,2751,3406,4 +2859,alias_default_703,call_function,alias.default,forward,24,1,1,2,3,3407,3 +2860,einsum_default_174,call_function,einsum.default,forward,24,2,2,1,2756,3405,5 +2861,add_124,call_function,add.Tensor,forward,24,2,2,1,2757,3404,10 +2862,dtype_cast_226,call_function,dtype_cast.default,forward,25,1,1,1,1,3393,2 
+2863,alias_default_704,call_function,alias.default,forward,24,1,1,3,2758,3403,4 +2864,convert_element_type_600,call_function,convert_element_type.default,forward,25,1,1,1,2759,3401,4 +2865,alias_default_706,call_function,alias.default,forward,25,1,1,2,2760,3400,4 +2866,pow_51,call_function,pow.Tensor_Scalar,forward,25,1,1,1,2761,3399,4 +2867,mean_50,call_function,mean.dim,forward,25,1,1,1,2762,3398,4 +2868,add_125,call_function,add.Scalar,forward,25,1,1,1,2763,3397,3 +2869,rsqrt_50,call_function,rsqrt.default,forward,25,1,1,1,2764,3396,3 +2870,alias_default_707,call_function,alias.default,forward,25,1,1,3,2765,3395,3 +2871,mul_175,call_function,mul.Tensor,forward,25,2,2,1,2766,3391,8 +2872,alias_default_705,call_function,alias.default,forward,25,1,1,2,2,3392,2 +2873,mul_176,call_function,mul.Tensor,forward,25,2,2,1,2770,3390,8 +2874,convert_element_type_601,call_function,convert_element_type.default,forward,25,1,1,1,2771,3389,6 +2875,dtype_cast_227,call_function,dtype_cast.default,forward,25,1,1,1,1,3376,3 +2876,permute_275,call_function,permute.default,forward,25,1,1,1,2,3375,3 +2877,alias_default_708,call_function,alias.default,forward,25,1,1,6,2772,3388,4 +2878,alias_default_709,call_function,alias.default,forward,25,1,1,2,3,3374,3 +2879,einsum_default_175,call_function,einsum.default,forward,25,2,2,1,2777,3372,5 +2880,dtype_cast_228,call_function,dtype_cast.default,forward,25,1,1,1,1,3376,3 +2881,permute_276,call_function,permute.default,forward,25,1,1,1,2,3375,3 +2882,alias_default_710,call_function,alias.default,forward,25,1,1,2,3,3374,3 +2883,einsum_default_176,call_function,einsum.default,forward,25,2,2,1,2777,3372,5 +2884,dtype_cast_229,call_function,dtype_cast.default,forward,25,1,1,1,1,3369,3 +2885,permute_277,call_function,permute.default,forward,25,1,1,1,2,3368,3 +2886,alias_default_711,call_function,alias.default,forward,25,1,1,2,3,3367,3 +2887,einsum_default_177,call_function,einsum.default,forward,25,2,2,1,2777,3365,5 
+2888,view_581,call_function,view.default,forward,25,1,1,1,2778,3371,4 +2889,view_582,call_function,view.default,forward,25,1,1,1,2778,3371,4 +2890,view_583,call_function,view.default,forward,25,1,1,1,2778,3364,4 +2891,convert_element_type_608,call_function,convert_element_type.default,forward,25,1,1,1,2779,3370,4 +2892,view_584,call_function,view.default,forward,25,1,1,1,2780,3369,4 +2893,view_as_complex_50,call_function,view_as_complex.default,forward,25,1,1,1,2781,3368,6 +2894,convert_element_type_609,call_function,convert_element_type.default,forward,25,1,1,1,2779,3370,4 +2895,view_585,call_function,view.default,forward,25,1,1,1,2780,3369,4 +2896,view_as_complex_51,call_function,view_as_complex.default,forward,25,1,1,1,2781,3368,6 +2897,view_586,call_function,view.default,forward,25,1,1,1,2,3379,3 +2898,alias_default_712,call_function,alias.default,forward,25,1,1,4,3,3378,3 +2899,mul_177,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8 +2900,view_as_real_50,call_function,view_as_real.default,forward,25,1,1,1,2785,3366,6 +2901,view_587,call_function,view.default,forward,25,1,1,1,2786,3365,6 +2902,mul_178,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8 +2903,view_as_real_51,call_function,view_as_real.default,forward,25,1,1,1,2785,3366,6 +2904,view_588,call_function,view.default,forward,25,1,1,1,2786,3365,6 +2905,convert_element_type_610,call_function,convert_element_type.default,forward,25,1,1,1,2787,3364,6 +2906,convert_element_type_611,call_function,convert_element_type.default,forward,25,1,1,1,2787,3364,6 +2907,permute_278,call_function,permute.default,forward,25,1,1,1,2788,3363,6 +2908,permute_279,call_function,permute.default,forward,25,1,1,1,2788,3363,6 +2909,permute_280,call_function,permute.default,forward,25,1,1,1,2779,3363,4 +2910,alias_default_713,call_function,alias.default,forward,25,1,1,2,2789,3362,4 +2911,alias_default_714,call_function,alias.default,forward,25,1,1,2,2789,3362,4 
+2912,alias_default_715,call_function,alias.default,forward,25,1,1,2,2780,3362,4 +2913,_scaled_dot_product_flash_attention_25,call_function,_scaled_dot_product_flash_attention.default,forward,25,3,3,4,2813,3361,2 +2914,getitem_225,call_function,getitem,forward,25,1,1,1,2814,3357,2 +2915,getitem_226,call_function,getitem,forward,25,1,1,1,2814,2814,2 +2916,getitem_231,call_function,getitem,forward,25,1,1,1,2814,2814,1 +2917,getitem_232,call_function,getitem,forward,25,1,1,1,2814,2814,1 +2918,alias_default_716,call_function,alias.default,forward,25,1,1,2,2815,3356,4 +2919,permute_281,call_function,permute.default,forward,25,1,1,1,2816,3355,4 +2920,view_589,call_function,view.default,forward,25,1,1,1,2817,3354,3 +2921,dtype_cast_230,call_function,dtype_cast.default,forward,25,1,1,1,1,3356,3 +2922,permute_282,call_function,permute.default,forward,25,1,1,1,2,3355,3 +2923,alias_default_717,call_function,alias.default,forward,25,1,1,2,2818,3353,4 +2924,alias_default_718,call_function,alias.default,forward,25,1,1,2,3,3354,3 +2925,einsum_default_178,call_function,einsum.default,forward,25,2,2,1,2823,3352,5 +2926,add_126,call_function,add.Tensor,forward,25,2,2,1,2824,3351,10 +2927,dtype_cast_231,call_function,dtype_cast.default,forward,25,1,1,1,1,3340,2 +2928,alias_default_719,call_function,alias.default,forward,25,1,1,3,2825,3350,4 +2929,convert_element_type_614,call_function,convert_element_type.default,forward,25,1,1,1,2826,3348,4 +2930,alias_default_721,call_function,alias.default,forward,25,1,1,2,2827,3347,4 +2931,pow_52,call_function,pow.Tensor_Scalar,forward,25,1,1,1,2828,3346,4 +2932,mean_51,call_function,mean.dim,forward,25,1,1,1,2829,3345,4 +2933,add_127,call_function,add.Scalar,forward,25,1,1,1,2830,3344,3 +2934,rsqrt_51,call_function,rsqrt.default,forward,25,1,1,1,2831,3343,3 +2935,alias_default_722,call_function,alias.default,forward,25,1,1,3,2832,3342,3 +2936,mul_179,call_function,mul.Tensor,forward,25,2,2,1,2833,3338,8 
+2937,alias_default_720,call_function,alias.default,forward,25,1,1,2,2,3339,2 +2938,mul_180,call_function,mul.Tensor,forward,25,2,2,1,2837,3337,8 +2939,convert_element_type_615,call_function,convert_element_type.default,forward,25,1,1,1,2838,3336,6 +2940,dtype_cast_232,call_function,dtype_cast.default,forward,25,1,1,1,1,3336,3 +2941,permute_283,call_function,permute.default,forward,25,1,1,1,2,3335,3 +2942,alias_default_723,call_function,alias.default,forward,25,1,1,4,2839,3335,4 +2943,alias_default_724,call_function,alias.default,forward,25,1,1,2,3,3334,3 +2944,einsum_default_179,call_function,einsum.default,forward,25,2,2,1,2844,3332,5 +2945,alias_default_725,call_function,alias.default,forward,25,1,1,2,2845,3331,4 +2946,convert_element_type_618,call_function,convert_element_type.default,forward,25,1,1,1,2846,3319,4 +2947,alias_default_726,call_function,alias.default,forward,25,1,1,2,2847,3318,4 +2948,neg_25,call_function,neg.default,forward,25,1,1,1,2848,3317,8 +2949,exp_25,call_function,exp.default,forward,25,1,1,1,2849,3316,6 +2950,add_128,call_function,add.Tensor,forward,25,1,1,1,2850,3315,4 +2951,div_25,call_function,div.Tensor,forward,25,2,2,1,2851,3314,6 +2952,convert_element_type_619,call_function,convert_element_type.default,forward,25,1,1,1,2852,3313,6 +2953,dtype_cast_233,call_function,dtype_cast.default,forward,25,1,1,1,1,3317,3 +2954,permute_284,call_function,permute.default,forward,25,1,1,1,2,3316,3 +2955,alias_default_728,call_function,alias.default,forward,25,1,1,2,3,3315,3 +2956,einsum_default_180,call_function,einsum.default,forward,25,2,2,1,2844,3313,5 +2957,alias_default_727,call_function,alias.default,forward,25,1,1,2,2853,3312,4 +2958,alias_default_729,call_function,alias.default,forward,25,1,1,2,2845,3312,4 +2959,mul_181,call_function,mul.Tensor,forward,25,2,2,1,2860,3311,8 +2960,dtype_cast_234,call_function,dtype_cast.default,forward,25,1,1,1,1,3313,3 +2961,permute_285,call_function,permute.default,forward,25,1,1,1,2,3312,3 
+2962,alias_default_730,call_function,alias.default,forward,25,1,1,2,2861,3310,4 +2963,alias_default_731,call_function,alias.default,forward,25,1,1,2,3,3311,3 +2964,einsum_default_181,call_function,einsum.default,forward,25,2,2,1,2866,3309,5 +2965,add_129,call_function,add.Tensor,forward,25,2,2,1,2867,3308,10 +2966,dtype_cast_235,call_function,dtype_cast.default,forward,26,1,1,1,1,3297,2 +2967,alias_default_732,call_function,alias.default,forward,25,1,1,3,2868,3307,4 +2968,convert_element_type_624,call_function,convert_element_type.default,forward,26,1,1,1,2869,3305,4 +2969,alias_default_734,call_function,alias.default,forward,26,1,1,2,2870,3304,4 +2970,pow_53,call_function,pow.Tensor_Scalar,forward,26,1,1,1,2871,3303,4 +2971,mean_52,call_function,mean.dim,forward,26,1,1,1,2872,3302,4 +2972,add_130,call_function,add.Scalar,forward,26,1,1,1,2873,3301,3 +2973,rsqrt_52,call_function,rsqrt.default,forward,26,1,1,1,2874,3300,3 +2974,alias_default_735,call_function,alias.default,forward,26,1,1,3,2875,3299,3 +2975,mul_182,call_function,mul.Tensor,forward,26,2,2,1,2876,3295,8 +2976,alias_default_733,call_function,alias.default,forward,26,1,1,2,2,3296,2 +2977,mul_183,call_function,mul.Tensor,forward,26,2,2,1,2880,3294,8 +2978,convert_element_type_625,call_function,convert_element_type.default,forward,26,1,1,1,2881,3293,6 +2979,dtype_cast_236,call_function,dtype_cast.default,forward,26,1,1,1,1,3280,3 +2980,permute_286,call_function,permute.default,forward,26,1,1,1,2,3279,3 +2981,alias_default_736,call_function,alias.default,forward,26,1,1,6,2882,3292,4 +2982,alias_default_737,call_function,alias.default,forward,26,1,1,2,3,3278,3 +2983,einsum_default_182,call_function,einsum.default,forward,26,2,2,1,2887,3276,5 +2984,dtype_cast_237,call_function,dtype_cast.default,forward,26,1,1,1,1,3280,3 +2985,permute_287,call_function,permute.default,forward,26,1,1,1,2,3279,3 +2986,alias_default_738,call_function,alias.default,forward,26,1,1,2,3,3278,3 
+2987,einsum_default_183,call_function,einsum.default,forward,26,2,2,1,2887,3276,5 +2988,dtype_cast_238,call_function,dtype_cast.default,forward,26,1,1,1,1,3273,3 +2989,permute_288,call_function,permute.default,forward,26,1,1,1,2,3272,3 +2990,alias_default_739,call_function,alias.default,forward,26,1,1,2,3,3271,3 +2991,einsum_default_184,call_function,einsum.default,forward,26,2,2,1,2887,3269,5 +2992,view_604,call_function,view.default,forward,26,1,1,1,2888,3275,4 +2993,view_605,call_function,view.default,forward,26,1,1,1,2888,3275,4 +2994,view_606,call_function,view.default,forward,26,1,1,1,2888,3268,4 +2995,convert_element_type_632,call_function,convert_element_type.default,forward,26,1,1,1,2889,3274,4 +2996,view_607,call_function,view.default,forward,26,1,1,1,2890,3273,4 +2997,view_as_complex_52,call_function,view_as_complex.default,forward,26,1,1,1,2891,3272,6 +2998,convert_element_type_633,call_function,convert_element_type.default,forward,26,1,1,1,2889,3274,4 +2999,view_608,call_function,view.default,forward,26,1,1,1,2890,3273,4 +3000,view_as_complex_53,call_function,view_as_complex.default,forward,26,1,1,1,2891,3272,6 +3001,view_609,call_function,view.default,forward,26,1,1,1,2,3283,3 +3002,alias_default_740,call_function,alias.default,forward,26,1,1,4,3,3282,3 +3003,mul_184,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8 +3004,view_as_real_52,call_function,view_as_real.default,forward,26,1,1,1,2895,3270,6 +3005,view_610,call_function,view.default,forward,26,1,1,1,2896,3269,6 +3006,mul_185,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8 +3007,view_as_real_53,call_function,view_as_real.default,forward,26,1,1,1,2895,3270,6 +3008,view_611,call_function,view.default,forward,26,1,1,1,2896,3269,6 +3009,convert_element_type_634,call_function,convert_element_type.default,forward,26,1,1,1,2897,3268,6 +3010,convert_element_type_635,call_function,convert_element_type.default,forward,26,1,1,1,2897,3268,6 
+3011,permute_289,call_function,permute.default,forward,26,1,1,1,2898,3267,6 +3012,permute_290,call_function,permute.default,forward,26,1,1,1,2898,3267,6 +3013,permute_291,call_function,permute.default,forward,26,1,1,1,2889,3267,4 +3014,alias_default_741,call_function,alias.default,forward,26,1,1,2,2899,3266,4 +3015,alias_default_742,call_function,alias.default,forward,26,1,1,2,2899,3266,4 +3016,alias_default_743,call_function,alias.default,forward,26,1,1,2,2890,3266,4 +3017,_scaled_dot_product_flash_attention_26,call_function,_scaled_dot_product_flash_attention.default,forward,26,3,3,4,2923,3265,2 +3018,getitem_234,call_function,getitem,forward,26,1,1,1,2924,3261,2 +3019,getitem_235,call_function,getitem,forward,26,1,1,1,2924,2924,2 +3020,getitem_240,call_function,getitem,forward,26,1,1,1,2924,2924,1 +3021,getitem_241,call_function,getitem,forward,26,1,1,1,2924,2924,1 +3022,alias_default_744,call_function,alias.default,forward,26,1,1,2,2925,3260,4 +3023,permute_292,call_function,permute.default,forward,26,1,1,1,2926,3259,4 +3024,view_612,call_function,view.default,forward,26,1,1,1,2927,3258,3 +3025,dtype_cast_239,call_function,dtype_cast.default,forward,26,1,1,1,1,3260,3 +3026,permute_293,call_function,permute.default,forward,26,1,1,1,2,3259,3 +3027,alias_default_745,call_function,alias.default,forward,26,1,1,2,2928,3257,4 +3028,alias_default_746,call_function,alias.default,forward,26,1,1,2,3,3258,3 +3029,einsum_default_185,call_function,einsum.default,forward,26,2,2,1,2933,3256,5 +3030,add_131,call_function,add.Tensor,forward,26,2,2,1,2934,3255,10 +3031,dtype_cast_240,call_function,dtype_cast.default,forward,26,1,1,1,1,3244,2 +3032,alias_default_747,call_function,alias.default,forward,26,1,1,3,2935,3254,4 +3033,convert_element_type_638,call_function,convert_element_type.default,forward,26,1,1,1,2936,3252,4 +3034,alias_default_749,call_function,alias.default,forward,26,1,1,2,2937,3251,4 +3035,pow_54,call_function,pow.Tensor_Scalar,forward,26,1,1,1,2938,3250,4 
+3036,mean_53,call_function,mean.dim,forward,26,1,1,1,2939,3249,4 +3037,add_132,call_function,add.Scalar,forward,26,1,1,1,2940,3248,3 +3038,rsqrt_53,call_function,rsqrt.default,forward,26,1,1,1,2941,3247,3 +3039,alias_default_750,call_function,alias.default,forward,26,1,1,3,2942,3246,3 +3040,mul_186,call_function,mul.Tensor,forward,26,2,2,1,2943,3242,8 +3041,alias_default_748,call_function,alias.default,forward,26,1,1,2,2,3243,2 +3042,mul_187,call_function,mul.Tensor,forward,26,2,2,1,2947,3241,8 +3043,convert_element_type_639,call_function,convert_element_type.default,forward,26,1,1,1,2948,3240,6 +3044,dtype_cast_241,call_function,dtype_cast.default,forward,26,1,1,1,1,3240,3 +3045,permute_294,call_function,permute.default,forward,26,1,1,1,2,3239,3 +3046,alias_default_751,call_function,alias.default,forward,26,1,1,4,2949,3239,4 +3047,alias_default_752,call_function,alias.default,forward,26,1,1,2,3,3238,3 +3048,einsum_default_186,call_function,einsum.default,forward,26,2,2,1,2954,3236,5 +3049,alias_default_753,call_function,alias.default,forward,26,1,1,2,2955,3235,4 +3050,convert_element_type_642,call_function,convert_element_type.default,forward,26,1,1,1,2956,3223,4 +3051,alias_default_754,call_function,alias.default,forward,26,1,1,2,2957,3222,4 +3052,neg_26,call_function,neg.default,forward,26,1,1,1,2958,3221,8 +3053,exp_26,call_function,exp.default,forward,26,1,1,1,2959,3220,6 +3054,add_133,call_function,add.Tensor,forward,26,1,1,1,2960,3219,4 +3055,div_26,call_function,div.Tensor,forward,26,2,2,1,2961,3218,6 +3056,convert_element_type_643,call_function,convert_element_type.default,forward,26,1,1,1,2962,3217,6 +3057,dtype_cast_242,call_function,dtype_cast.default,forward,26,1,1,1,1,3221,3 +3058,permute_295,call_function,permute.default,forward,26,1,1,1,2,3220,3 +3059,alias_default_756,call_function,alias.default,forward,26,1,1,2,3,3219,3 +3060,einsum_default_187,call_function,einsum.default,forward,26,2,2,1,2954,3217,5 
+3061,alias_default_755,call_function,alias.default,forward,26,1,1,2,2963,3216,4 +3062,alias_default_757,call_function,alias.default,forward,26,1,1,2,2955,3216,4 +3063,mul_188,call_function,mul.Tensor,forward,26,2,2,1,2970,3215,8 +3064,dtype_cast_243,call_function,dtype_cast.default,forward,26,1,1,1,1,3217,3 +3065,permute_296,call_function,permute.default,forward,26,1,1,1,2,3216,3 +3066,alias_default_758,call_function,alias.default,forward,26,1,1,2,2971,3214,4 +3067,alias_default_759,call_function,alias.default,forward,26,1,1,2,3,3215,3 +3068,einsum_default_188,call_function,einsum.default,forward,26,2,2,1,2976,3213,5 +3069,add_134,call_function,add.Tensor,forward,26,2,2,1,2977,3212,10 +3070,dtype_cast_244,call_function,dtype_cast.default,forward,27,1,1,1,1,3201,2 +3071,alias_default_760,call_function,alias.default,forward,26,1,1,3,2978,3211,4 +3072,convert_element_type_648,call_function,convert_element_type.default,forward,27,1,1,1,2979,3209,4 +3073,alias_default_762,call_function,alias.default,forward,27,1,1,2,2980,3208,4 +3074,pow_55,call_function,pow.Tensor_Scalar,forward,27,1,1,1,2981,3207,4 +3075,mean_54,call_function,mean.dim,forward,27,1,1,1,2982,3206,4 +3076,add_135,call_function,add.Scalar,forward,27,1,1,1,2983,3205,3 +3077,rsqrt_54,call_function,rsqrt.default,forward,27,1,1,1,2984,3204,3 +3078,alias_default_763,call_function,alias.default,forward,27,1,1,3,2985,3203,3 +3079,mul_189,call_function,mul.Tensor,forward,27,2,2,1,2986,3199,8 +3080,alias_default_761,call_function,alias.default,forward,27,1,1,2,2,3200,2 +3081,mul_190,call_function,mul.Tensor,forward,27,2,2,1,2990,3198,8 +3082,convert_element_type_649,call_function,convert_element_type.default,forward,27,1,1,1,2991,3197,6 +3083,dtype_cast_245,call_function,dtype_cast.default,forward,27,1,1,1,1,3184,3 +3084,permute_297,call_function,permute.default,forward,27,1,1,1,2,3183,3 +3085,alias_default_764,call_function,alias.default,forward,27,1,1,6,2992,3196,4 
+3086,alias_default_765,call_function,alias.default,forward,27,1,1,2,3,3182,3 +3087,einsum_default_189,call_function,einsum.default,forward,27,2,2,1,2997,3180,5 +3088,dtype_cast_246,call_function,dtype_cast.default,forward,27,1,1,1,1,3184,3 +3089,permute_298,call_function,permute.default,forward,27,1,1,1,2,3183,3 +3090,alias_default_766,call_function,alias.default,forward,27,1,1,2,3,3182,3 +3091,einsum_default_190,call_function,einsum.default,forward,27,2,2,1,2997,3180,5 +3092,dtype_cast_247,call_function,dtype_cast.default,forward,27,1,1,1,1,3177,3 +3093,permute_299,call_function,permute.default,forward,27,1,1,1,2,3176,3 +3094,alias_default_767,call_function,alias.default,forward,27,1,1,2,3,3175,3 +3095,einsum_default_191,call_function,einsum.default,forward,27,2,2,1,2997,3173,5 +3096,view_627,call_function,view.default,forward,27,1,1,1,2998,3179,4 +3097,view_628,call_function,view.default,forward,27,1,1,1,2998,3179,4 +3098,view_629,call_function,view.default,forward,27,1,1,1,2998,3172,4 +3099,convert_element_type_656,call_function,convert_element_type.default,forward,27,1,1,1,2999,3178,4 +3100,view_630,call_function,view.default,forward,27,1,1,1,3000,3177,4 +3101,view_as_complex_54,call_function,view_as_complex.default,forward,27,1,1,1,3001,3176,6 +3102,convert_element_type_657,call_function,convert_element_type.default,forward,27,1,1,1,2999,3178,4 +3103,view_631,call_function,view.default,forward,27,1,1,1,3000,3177,4 +3104,view_as_complex_55,call_function,view_as_complex.default,forward,27,1,1,1,3001,3176,6 +3105,view_632,call_function,view.default,forward,27,1,1,1,2,3187,3 +3106,alias_default_768,call_function,alias.default,forward,27,1,1,4,3,3186,3 +3107,mul_191,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8 +3108,view_as_real_54,call_function,view_as_real.default,forward,27,1,1,1,3005,3174,6 +3109,view_633,call_function,view.default,forward,27,1,1,1,3006,3173,6 +3110,mul_192,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8 
+3111,view_as_real_55,call_function,view_as_real.default,forward,27,1,1,1,3005,3174,6 +3112,view_634,call_function,view.default,forward,27,1,1,1,3006,3173,6 +3113,convert_element_type_658,call_function,convert_element_type.default,forward,27,1,1,1,3007,3172,6 +3114,convert_element_type_659,call_function,convert_element_type.default,forward,27,1,1,1,3007,3172,6 +3115,permute_300,call_function,permute.default,forward,27,1,1,1,3008,3171,6 +3116,permute_301,call_function,permute.default,forward,27,1,1,1,3008,3171,6 +3117,permute_302,call_function,permute.default,forward,27,1,1,1,2999,3171,4 +3118,alias_default_769,call_function,alias.default,forward,27,1,1,2,3009,3170,4 +3119,alias_default_770,call_function,alias.default,forward,27,1,1,2,3009,3170,4 +3120,alias_default_771,call_function,alias.default,forward,27,1,1,2,3000,3170,4 +3121,_scaled_dot_product_flash_attention_27,call_function,_scaled_dot_product_flash_attention.default,forward,27,3,3,4,3033,3169,2 +3122,getitem_243,call_function,getitem,forward,27,1,1,1,3034,3165,2 +3123,getitem_244,call_function,getitem,forward,27,1,1,1,3034,3034,2 +3124,getitem_249,call_function,getitem,forward,27,1,1,1,3034,3034,1 +3125,getitem_250,call_function,getitem,forward,27,1,1,1,3034,3034,1 +3126,alias_default_772,call_function,alias.default,forward,27,1,1,2,3035,3164,4 +3127,permute_303,call_function,permute.default,forward,27,1,1,1,3036,3163,4 +3128,view_635,call_function,view.default,forward,27,1,1,1,3037,3162,3 +3129,dtype_cast_248,call_function,dtype_cast.default,forward,27,1,1,1,1,3164,3 +3130,permute_304,call_function,permute.default,forward,27,1,1,1,2,3163,3 +3131,alias_default_773,call_function,alias.default,forward,27,1,1,2,3038,3161,4 +3132,alias_default_774,call_function,alias.default,forward,27,1,1,2,3,3162,3 +3133,einsum_default_192,call_function,einsum.default,forward,27,2,2,1,3043,3160,5 +3134,add_136,call_function,add.Tensor,forward,27,2,2,1,3044,3159,10 
+3135,dtype_cast_249,call_function,dtype_cast.default,forward,27,1,1,1,1,3148,2 +3136,alias_default_775,call_function,alias.default,forward,27,1,1,3,3045,3158,4 +3137,convert_element_type_662,call_function,convert_element_type.default,forward,27,1,1,1,3046,3156,4 +3138,alias_default_777,call_function,alias.default,forward,27,1,1,2,3047,3155,4 +3139,pow_56,call_function,pow.Tensor_Scalar,forward,27,1,1,1,3048,3154,4 +3140,mean_55,call_function,mean.dim,forward,27,1,1,1,3049,3153,4 +3141,add_137,call_function,add.Scalar,forward,27,1,1,1,3050,3152,3 +3142,rsqrt_55,call_function,rsqrt.default,forward,27,1,1,1,3051,3151,3 +3143,alias_default_778,call_function,alias.default,forward,27,1,1,3,3052,3150,3 +3144,mul_193,call_function,mul.Tensor,forward,27,2,2,1,3053,3146,8 +3145,alias_default_776,call_function,alias.default,forward,27,1,1,2,2,3147,2 +3146,mul_194,call_function,mul.Tensor,forward,27,2,2,1,3057,3145,8 +3147,convert_element_type_663,call_function,convert_element_type.default,forward,27,1,1,1,3058,3144,6 +3148,dtype_cast_250,call_function,dtype_cast.default,forward,27,1,1,1,1,3144,3 +3149,permute_305,call_function,permute.default,forward,27,1,1,1,2,3143,3 +3150,alias_default_779,call_function,alias.default,forward,27,1,1,4,3059,3143,4 +3151,alias_default_780,call_function,alias.default,forward,27,1,1,2,3,3142,3 +3152,einsum_default_193,call_function,einsum.default,forward,27,2,2,1,3064,3140,5 +3153,alias_default_781,call_function,alias.default,forward,27,1,1,2,3065,3139,4 +3154,convert_element_type_666,call_function,convert_element_type.default,forward,27,1,1,1,3066,3127,4 +3155,alias_default_782,call_function,alias.default,forward,27,1,1,2,3067,3126,4 +3156,neg_27,call_function,neg.default,forward,27,1,1,1,3068,3125,8 +3157,exp_27,call_function,exp.default,forward,27,1,1,1,3069,3124,6 +3158,add_138,call_function,add.Tensor,forward,27,1,1,1,3070,3123,4 +3159,div_27,call_function,div.Tensor,forward,27,2,2,1,3071,3122,6 
+3160,convert_element_type_667,call_function,convert_element_type.default,forward,27,1,1,1,3072,3121,6 +3161,dtype_cast_251,call_function,dtype_cast.default,forward,27,1,1,1,1,3125,3 +3162,permute_306,call_function,permute.default,forward,27,1,1,1,2,3124,3 +3163,alias_default_784,call_function,alias.default,forward,27,1,1,2,3,3123,3 +3164,einsum_default_194,call_function,einsum.default,forward,27,2,2,1,3064,3121,5 +3165,alias_default_783,call_function,alias.default,forward,27,1,1,2,3073,3120,4 +3166,alias_default_785,call_function,alias.default,forward,27,1,1,2,3065,3120,4 +3167,mul_195,call_function,mul.Tensor,forward,27,2,2,1,3080,3119,8 +3168,dtype_cast_252,call_function,dtype_cast.default,forward,27,1,1,1,1,3121,3 +3169,permute_307,call_function,permute.default,forward,27,1,1,1,2,3120,3 +3170,alias_default_786,call_function,alias.default,forward,27,1,1,2,3081,3118,4 +3171,alias_default_787,call_function,alias.default,forward,27,1,1,2,3,3119,3 +3172,einsum_default_195,call_function,einsum.default,forward,27,2,2,1,3086,3117,5 +3173,add_139,call_function,add.Tensor,forward,27,2,2,1,3087,3116,10 +3174,dtype_cast_253,call_function,dtype_cast.default,forward,,1,1,1,1,3102,2 +3175,alias_default_788,call_function,alias.default,forward,27,1,1,2,3088,3115,4 +3176,convert_element_type_672,call_function,convert_element_type.default,forward,,1,1,1,3089,3113,4 +3177,alias_default_790,call_function,alias.default,forward,,1,1,2,3090,3112,4 +3178,pow_57,call_function,pow.Tensor_Scalar,forward,,1,1,1,3091,3111,4 +3179,mean_56,call_function,mean.dim,forward,,1,1,1,3092,3110,4 +3180,add_140,call_function,add.Scalar,forward,,1,1,1,3093,3109,3 +3181,rsqrt_56,call_function,rsqrt.default,forward,,1,1,1,3094,3108,3 +3182,alias_default_791,call_function,alias.default,forward,,1,1,3,3095,3107,3 +3183,mul_196,call_function,mul.Tensor,forward,,2,2,1,3096,10,8 +3184,alias_default_789,call_function,alias.default,forward,,1,1,2,2,3101,2 
+3185,mul_197,call_function,mul.Tensor,forward,,2,2,1,3100,9,8 +3186,convert_element_type_673,call_function,convert_element_type.default,forward,,1,1,1,3101,8,6 +3187,dtype_cast_254,call_function,dtype_cast.default,forward,,1,1,1,2,3105,3 +3188,permute_308,call_function,permute.default,forward,,1,1,1,3,3104,3 +3189,alias_default_792,call_function,alias.default,forward,,1,1,2,3102,7,4 +3190,alias_default_793,call_function,alias.default,forward,,1,1,2,4,3103,3 +3191,einsum_default_196,call_function,einsum.default,forward,,2,2,1,3106,1,5 +3192,alias_default_1245,call_function,alias.default,forward,,1,1,0,3107,0,4 +3193,alias_default_3,call_function,alias.default,unknown,,1,1,2,1,3103,4 +3194,einsum_default_197,call_function,einsum.default,backward,,2,2,1,3105,4,5 +3195,permute_311,call_function,permute.default,backward,,1,1,1,5,3100,3 +3196,einsum_default_198,call_function,einsum.default,backward,,2,2,1,8,3099,5 +3197,permute_312,call_function,permute.default,backward,,1,1,1,3106,3,4 +3198,dtype_cast_255,call_function,dtype_cast.default,backward,,1,1,1,3107,2,4 +3199,convert_element_type_680,call_function,convert_element_type.default,backward,,1,1,1,9,3098,5 +3200,convert_element_type_681,call_function,convert_element_type.default,backward,,1,1,1,3089,3098,4 +3201,convert_element_type_682,call_function,convert_element_type.default,backward,,1,1,1,3,3092,2 +3202,alias_default_794,call_function,alias.default,backward,,1,1,2,10,3097,4 +3203,mul_198,call_function,mul.Tensor,backward,,2,2,1,15,3091,8 +3204,mul_199,call_function,mul.Tensor,backward,,2,2,1,3097,3097,8 +3205,alias_default_795,call_function,alias.default,backward,,1,1,2,16,3090,4 +3206,alias_default_796,call_function,alias.default,backward,,1,1,3,3098,3096,4 +3207,mul_200,call_function,mul.Tensor,backward,,2,2,1,3114,3089,8 +3208,sum_1,call_function,sum.dim_IntList,backward,,1,1,1,3115,3088,5 +3209,div_28,call_function,div.Tensor,backward,,1,1,1,3099,3088,6 
+3210,mul_201,call_function,mul.Tensor,backward,,2,2,1,3117,3087,8 +3211,sub,call_function,sub.Tensor,backward,,2,2,1,3118,3086,10 +3212,mul_202,call_function,mul.Tensor,backward,,2,2,1,3119,3085,8 +3213,mul_203,call_function,mul.Tensor,backward,,2,2,1,3108,4,8 +3214,sum_2,call_function,sum.dim_IntList,backward,,1,1,1,3109,3,5 +3215,convert_element_type_683,call_function,convert_element_type.default,backward,,1,1,1,3120,3084,6 +3216,convert_element_type_684,call_function,convert_element_type.default,backward,,1,1,1,3110,2,3 +3217,dtype_cast_256,call_function,dtype_cast.default,backward,,1,1,1,3111,1,3 +3218,alias_default_1499,call_function,alias.default,backward,,1,1,0,3112,0,2 +3219,alias_default_797,call_function,alias.default,backward,,1,1,3,3121,3083,4 +3220,einsum_default_199,call_function,einsum.default,backward,27,2,2,1,3122,3,5 +3221,permute_315,call_function,permute.default,backward,27,1,1,1,4,3079,3 +3222,einsum_default_200,call_function,einsum.default,backward,27,2,2,1,3123,3078,5 +3223,permute_316,call_function,permute.default,backward,27,1,1,1,3123,2,4 +3224,dtype_cast_257,call_function,dtype_cast.default,backward,27,1,1,1,3124,1,4 +3225,alias_default_1495,call_function,alias.default,backward,27,1,1,0,3125,0,3 +3226,alias_default_798,call_function,alias.default,backward,27,1,1,2,3124,3077,4 +3227,mul_204,call_function,mul.Tensor,backward,27,2,2,1,3125,3065,8 +3228,mul_205,call_function,mul.Tensor,backward,27,2,2,1,3125,3069,8 +3229,alias_default_799,call_function,alias.default,backward,27,1,1,2,3126,3064,4 +3230,einsum_default_201,call_function,einsum.default,backward,27,2,2,1,3127,3,5 +3231,permute_319,call_function,permute.default,backward,27,1,1,1,4,3060,3 +3232,einsum_default_202,call_function,einsum.default,backward,27,2,2,1,3128,3059,5 +3233,permute_320,call_function,permute.default,backward,27,1,1,1,3128,2,4 +3234,dtype_cast_258,call_function,dtype_cast.default,backward,27,1,1,1,3129,1,4 
+3235,alias_default_1496,call_function,alias.default,backward,27,1,1,0,3130,0,3 +3236,convert_element_type_693,call_function,convert_element_type.default,backward,27,1,1,1,3126,3068,6 +3237,convert_element_type_694,call_function,convert_element_type.default,backward,27,1,1,1,3066,3078,4 +3238,alias_default_800,call_function,alias.default,backward,27,1,1,2,3067,3077,4 +3239,neg_28,call_function,neg.default,backward,27,1,1,1,3068,3076,8 +3240,exp_28,call_function,exp.default,backward,27,1,1,1,3069,3075,6 +3241,add_141,call_function,add.Tensor,backward,27,1,1,1,3070,3074,4 +3242,reciprocal,call_function,reciprocal.default,backward,27,1,1,1,3071,3073,4 +3243,mul_206,call_function,mul.Tensor,backward,27,1,1,1,3072,3072,6 +3244,alias_default_801,call_function,alias.default,backward,27,1,1,2,3073,3071,4 +3245,mul_207,call_function,mul.Tensor,backward,27,2,2,1,3135,3067,8 +3246,sub_1,call_function,sub.Tensor,backward,27,1,1,1,3074,3069,4 +3247,mul_208,call_function,mul.Tensor,backward,27,2,2,1,3075,3068,8 +3248,add_142,call_function,add.Tensor,backward,27,1,1,1,3076,3067,4 +3249,mul_209,call_function,mul.Tensor,backward,27,2,2,1,3139,3066,8 +3250,convert_element_type_695,call_function,convert_element_type.default,backward,27,1,1,1,3140,3065,6 +3251,alias_default_802,call_function,alias.default,backward,27,1,1,2,3141,3064,4 +3252,einsum_default_203,call_function,einsum.default,backward,27,2,2,1,3142,3,5 +3253,permute_323,call_function,permute.default,backward,27,1,1,1,4,3060,3 +3254,einsum_default_204,call_function,einsum.default,backward,27,2,2,1,3143,3059,5 +3255,add_143,call_function,add.Tensor,unknown,,2,2,1,3148,3058,10 +3256,permute_324,call_function,permute.default,backward,27,1,1,1,3143,2,4 +3257,dtype_cast_259,call_function,dtype_cast.default,backward,27,1,1,1,3144,1,4 +3258,alias_default_1494,call_function,alias.default,backward,27,1,1,0,3145,0,3 +3259,convert_element_type_700,call_function,convert_element_type.default,backward,27,1,1,1,3149,3057,8 
+3260,convert_element_type_701,call_function,convert_element_type.default,backward,27,1,1,1,3046,3057,4 +3261,convert_element_type_702,call_function,convert_element_type.default,backward,27,1,1,1,3,3051,2 +3262,alias_default_803,call_function,alias.default,backward,27,1,1,2,3150,3056,4 +3263,mul_210,call_function,mul.Tensor,backward,27,2,2,1,3152,3050,8 +3264,mul_211,call_function,mul.Tensor,backward,27,2,2,1,3054,3056,8 +3265,alias_default_804,call_function,alias.default,backward,27,1,1,2,3153,3049,4 +3266,alias_default_805,call_function,alias.default,backward,27,1,1,3,3055,3055,4 +3267,mul_212,call_function,mul.Tensor,backward,27,2,2,1,3157,3048,8 +3268,sum_3,call_function,sum.dim_IntList,backward,27,1,1,1,3158,3047,5 +3269,div_29,call_function,div.Tensor,backward,27,1,1,1,3056,3047,6 +3270,mul_213,call_function,mul.Tensor,backward,27,2,2,1,3160,3046,8 +3271,sub_2,call_function,sub.Tensor,backward,27,2,2,1,3161,3045,10 +3272,mul_214,call_function,mul.Tensor,backward,27,2,2,1,3162,3044,8 +3273,mul_215,call_function,mul.Tensor,backward,27,2,2,1,3154,4,8 +3274,sum_4,call_function,sum.dim_IntList,backward,27,1,1,1,3155,3,5 +3275,convert_element_type_703,call_function,convert_element_type.default,backward,27,1,1,1,3163,3043,6 +3276,convert_element_type_704,call_function,convert_element_type.default,backward,27,1,1,1,3156,2,3 +3277,add_144,call_function,add.Tensor,unknown,,2,2,1,3164,3042,10 +3278,dtype_cast_260,call_function,dtype_cast.default,backward,27,1,1,1,3157,1,3 +3279,alias_default_1498,call_function,alias.default,backward,27,1,1,0,3158,0,2 +3280,alias_default_806,call_function,alias.default,unknown,,1,1,3,3165,3041,4 +3281,einsum_default_205,call_function,einsum.default,backward,27,2,2,1,3166,3,5 +3282,permute_327,call_function,permute.default,backward,27,1,1,1,4,3037,3 +3283,einsum_default_206,call_function,einsum.default,backward,27,2,2,1,3167,3036,5 +3284,permute_328,call_function,permute.default,backward,27,1,1,1,3167,2,4 
+3285,dtype_cast_261,call_function,dtype_cast.default,backward,27,1,1,1,3168,1,4 +3286,alias_default_1493,call_function,alias.default,backward,27,1,1,0,3169,0,3 +3287,view_656,call_function,view.default,backward,27,1,1,1,3168,3035,4 +3288,permute_329,call_function,permute.default,backward,27,1,1,1,3169,3034,4 +3289,_scaled_dot_product_flash_attention_backward,call_function,_scaled_dot_product_flash_attention_backward.default,backward,27,8,8,3,3173,3033,2 +3290,getitem_252,call_function,getitem,backward,27,1,1,1,3174,3006,2 +3291,getitem_253,call_function,getitem,backward,27,1,1,1,3174,3007,2 +3292,getitem_254,call_function,getitem,backward,27,1,1,1,3174,3000,2 +3293,permute_330,call_function,permute.default,backward,27,1,1,1,3175,2999,2 +3294,permute_331,call_function,permute.default,backward,27,1,1,1,3175,3006,2 +3295,permute_332,call_function,permute.default,backward,27,1,1,1,3175,3005,2 +3296,convert_element_type_709,call_function,convert_element_type.default,backward,27,1,1,1,3176,3005,2 +3297,convert_element_type_710,call_function,convert_element_type.default,backward,27,1,1,1,3176,3004,2 +3298,view_657,call_function,view.default,backward,27,1,1,1,3177,3004,2 +3299,view_as_complex_56,call_function,view_as_complex.default,backward,27,1,1,1,3178,3003,6 +3300,_conj,call_function,_conj.default,backward,27,1,1,1,4,3004,3 +3301,clone_6,call_function,clone.default,backward,27,1,1,1,5,3003,3 +3302,mul_216,call_function,mul.Tensor,backward,27,2,2,1,3181,3002,8 +3303,view_658,call_function,view.default,backward,27,1,1,1,3177,3003,2 +3304,view_as_complex_57,call_function,view_as_complex.default,backward,27,1,1,1,3178,3002,6 +3305,_conj_1,call_function,_conj.default,backward,27,1,1,1,4,3003,3 +3306,clone_7,call_function,clone.default,backward,27,1,1,1,5,3002,3 +3307,mul_217,call_function,mul.Tensor,backward,27,2,2,1,3181,3001,8 +3308,view_as_real_56,call_function,view_as_real.default,backward,27,1,1,1,3182,3001,6 
+3309,view_659,call_function,view.default,backward,27,1,1,1,3183,3000,6 +3310,convert_element_type_711,call_function,convert_element_type.default,backward,27,1,1,1,3184,2999,6 +3311,view_as_real_57,call_function,view_as_real.default,backward,27,1,1,1,3182,3000,6 +3312,view_660,call_function,view.default,backward,27,1,1,1,3183,2999,6 +3313,convert_element_type_712,call_function,convert_element_type.default,backward,27,1,1,1,3184,2998,6 +3314,view_661,call_function,view.default,backward,27,1,1,1,3176,2998,2 +3315,view_662,call_function,view.default,backward,27,1,1,1,3185,2998,5 +3316,view_663,call_function,view.default,backward,27,1,1,1,3185,2997,5 +3317,alias_default_807,call_function,alias.default,backward,27,1,1,2,3177,2997,4 +3318,einsum_default_207,call_function,einsum.default,backward,27,2,2,1,3178,3,5 +3319,permute_335,call_function,permute.default,backward,27,1,1,1,4,2993,3 +3320,einsum_default_208,call_function,einsum.default,backward,27,2,2,1,3179,2992,5 +3321,permute_336,call_function,permute.default,backward,27,1,1,1,3179,2,4 +3322,dtype_cast_262,call_function,dtype_cast.default,backward,27,1,1,1,3180,1,4 +3323,alias_default_1492,call_function,alias.default,backward,27,1,1,0,3181,0,3 +3324,alias_default_808,call_function,alias.default,backward,27,1,1,2,3186,2997,4 +3325,einsum_default_209,call_function,einsum.default,backward,27,2,2,1,3187,3,5 +3326,permute_339,call_function,permute.default,backward,27,1,1,1,4,2993,3 +3327,einsum_default_210,call_function,einsum.default,backward,27,2,2,1,3188,2992,5 +3328,add_145,call_function,add.Tensor,unknown,,2,2,1,3195,2991,10 +3329,permute_340,call_function,permute.default,backward,27,1,1,1,3188,2,4 +3330,dtype_cast_263,call_function,dtype_cast.default,backward,27,1,1,1,3189,1,4 +3331,alias_default_1491,call_function,alias.default,backward,27,1,1,0,3190,0,3 +3332,alias_default_809,call_function,alias.default,backward,27,1,1,2,3186,2996,4 
+3333,einsum_default_211,call_function,einsum.default,backward,27,2,2,1,3187,3,5 +3334,permute_343,call_function,permute.default,backward,27,1,1,1,4,2992,3 +3335,einsum_default_212,call_function,einsum.default,backward,27,2,2,1,3188,2991,5 +3336,add_146,call_function,add.Tensor,unknown,,2,2,1,3211,2990,10 +3337,permute_344,call_function,permute.default,backward,27,1,1,1,3188,2,4 +3338,dtype_cast_264,call_function,dtype_cast.default,backward,27,1,1,1,3189,1,4 +3339,alias_default_1490,call_function,alias.default,backward,27,1,1,0,3190,0,3 +3340,convert_element_type_725,call_function,convert_element_type.default,backward,27,1,1,1,3212,2989,8 +3341,convert_element_type_726,call_function,convert_element_type.default,backward,27,1,1,1,2979,2989,4 +3342,convert_element_type_727,call_function,convert_element_type.default,backward,27,1,1,1,3,2983,2 +3343,alias_default_810,call_function,alias.default,backward,27,1,1,2,3213,2988,4 +3344,mul_218,call_function,mul.Tensor,backward,27,2,2,1,3215,2982,8 +3345,mul_219,call_function,mul.Tensor,backward,27,2,2,1,2987,2988,8 +3346,alias_default_811,call_function,alias.default,backward,27,1,1,2,3216,2981,4 +3347,alias_default_812,call_function,alias.default,backward,27,1,1,3,2988,2987,4 +3348,mul_220,call_function,mul.Tensor,backward,27,2,2,1,3220,2980,8 +3349,sum_5,call_function,sum.dim_IntList,backward,27,1,1,1,3221,2979,5 +3350,div_30,call_function,div.Tensor,backward,27,1,1,1,2989,2979,6 +3351,mul_221,call_function,mul.Tensor,backward,27,2,2,1,3223,2978,8 +3352,sub_3,call_function,sub.Tensor,backward,27,2,2,1,3224,2977,10 +3353,mul_222,call_function,mul.Tensor,backward,27,2,2,1,3225,2976,8 +3354,mul_223,call_function,mul.Tensor,backward,27,2,2,1,3217,4,8 +3355,sum_6,call_function,sum.dim_IntList,backward,27,1,1,1,3218,3,5 +3356,convert_element_type_728,call_function,convert_element_type.default,backward,27,1,1,1,3226,2975,6 +3357,convert_element_type_729,call_function,convert_element_type.default,backward,27,1,1,1,3219,2,3 
+3358,add_147,call_function,add.Tensor,unknown,,2,2,1,3227,2974,10 +3359,dtype_cast_265,call_function,dtype_cast.default,backward,27,1,1,1,3220,1,3 +3360,alias_default_1497,call_function,alias.default,backward,27,1,1,0,3221,0,2 +3361,alias_default_813,call_function,alias.default,unknown,,1,1,3,3228,2973,4 +3362,einsum_default_213,call_function,einsum.default,backward,26,2,2,1,3229,3,5 +3363,permute_347,call_function,permute.default,backward,26,1,1,1,4,2969,3 +3364,einsum_default_214,call_function,einsum.default,backward,26,2,2,1,3230,2968,5 +3365,permute_348,call_function,permute.default,backward,26,1,1,1,3230,2,4 +3366,dtype_cast_266,call_function,dtype_cast.default,backward,26,1,1,1,3231,1,4 +3367,alias_default_1486,call_function,alias.default,backward,26,1,1,0,3232,0,3 +3368,alias_default_814,call_function,alias.default,backward,26,1,1,2,3231,2967,4 +3369,mul_224,call_function,mul.Tensor,backward,26,2,2,1,3232,2955,8 +3370,mul_225,call_function,mul.Tensor,backward,26,2,2,1,3232,2959,8 +3371,alias_default_815,call_function,alias.default,backward,26,1,1,2,3233,2954,4 +3372,einsum_default_215,call_function,einsum.default,backward,26,2,2,1,3234,3,5 +3373,permute_351,call_function,permute.default,backward,26,1,1,1,4,2950,3 +3374,einsum_default_216,call_function,einsum.default,backward,26,2,2,1,3235,2949,5 +3375,permute_352,call_function,permute.default,backward,26,1,1,1,3235,2,4 +3376,dtype_cast_267,call_function,dtype_cast.default,backward,26,1,1,1,3236,1,4 +3377,alias_default_1487,call_function,alias.default,backward,26,1,1,0,3237,0,3 +3378,convert_element_type_738,call_function,convert_element_type.default,backward,26,1,1,1,3233,2958,6 +3379,convert_element_type_739,call_function,convert_element_type.default,backward,26,1,1,1,2956,2968,4 +3380,alias_default_816,call_function,alias.default,backward,26,1,1,2,2957,2967,4 +3381,neg_29,call_function,neg.default,backward,26,1,1,1,2958,2966,8 +3382,exp_29,call_function,exp.default,backward,26,1,1,1,2959,2965,6 
+3383,add_148,call_function,add.Tensor,backward,26,1,1,1,2960,2964,4 +3384,reciprocal_1,call_function,reciprocal.default,backward,26,1,1,1,2961,2963,4 +3385,mul_226,call_function,mul.Tensor,backward,26,1,1,1,2962,2962,6 +3386,alias_default_817,call_function,alias.default,backward,26,1,1,2,2963,2961,4 +3387,mul_227,call_function,mul.Tensor,backward,26,2,2,1,3242,2957,8 +3388,sub_4,call_function,sub.Tensor,backward,26,1,1,1,2964,2959,4 +3389,mul_228,call_function,mul.Tensor,backward,26,2,2,1,2965,2958,8 +3390,add_149,call_function,add.Tensor,backward,26,1,1,1,2966,2957,4 +3391,mul_229,call_function,mul.Tensor,backward,26,2,2,1,3246,2956,8 +3392,convert_element_type_740,call_function,convert_element_type.default,backward,26,1,1,1,3247,2955,6 +3393,alias_default_818,call_function,alias.default,backward,26,1,1,2,3248,2954,4 +3394,einsum_default_217,call_function,einsum.default,backward,26,2,2,1,3249,3,5 +3395,permute_355,call_function,permute.default,backward,26,1,1,1,4,2950,3 +3396,einsum_default_218,call_function,einsum.default,backward,26,2,2,1,3250,2949,5 +3397,add_150,call_function,add.Tensor,unknown,,2,2,1,3255,2948,10 +3398,permute_356,call_function,permute.default,backward,26,1,1,1,3250,2,4 +3399,dtype_cast_268,call_function,dtype_cast.default,backward,26,1,1,1,3251,1,4 +3400,alias_default_1485,call_function,alias.default,backward,26,1,1,0,3252,0,3 +3401,convert_element_type_745,call_function,convert_element_type.default,backward,26,1,1,1,3256,2947,8 +3402,convert_element_type_746,call_function,convert_element_type.default,backward,26,1,1,1,2936,2947,4 +3403,convert_element_type_747,call_function,convert_element_type.default,backward,26,1,1,1,3,2941,2 +3404,alias_default_819,call_function,alias.default,backward,26,1,1,2,3257,2946,4 +3405,mul_230,call_function,mul.Tensor,backward,26,2,2,1,3259,2940,8 +3406,mul_231,call_function,mul.Tensor,backward,26,2,2,1,2944,2946,8 +3407,alias_default_820,call_function,alias.default,backward,26,1,1,2,3260,2939,4 
+3408,alias_default_821,call_function,alias.default,backward,26,1,1,3,2945,2945,4 +3409,mul_232,call_function,mul.Tensor,backward,26,2,2,1,3264,2938,8 +3410,sum_7,call_function,sum.dim_IntList,backward,26,1,1,1,3265,2937,5 +3411,div_31,call_function,div.Tensor,backward,26,1,1,1,2946,2937,6 +3412,mul_233,call_function,mul.Tensor,backward,26,2,2,1,3267,2936,8 +3413,sub_5,call_function,sub.Tensor,backward,26,2,2,1,3268,2935,10 +3414,mul_234,call_function,mul.Tensor,backward,26,2,2,1,3269,2934,8 +3415,mul_235,call_function,mul.Tensor,backward,26,2,2,1,3261,4,8 +3416,sum_8,call_function,sum.dim_IntList,backward,26,1,1,1,3262,3,5 +3417,convert_element_type_748,call_function,convert_element_type.default,backward,26,1,1,1,3270,2933,6 +3418,convert_element_type_749,call_function,convert_element_type.default,backward,26,1,1,1,3263,2,3 +3419,add_151,call_function,add.Tensor,unknown,,2,2,1,3271,2932,10 +3420,dtype_cast_269,call_function,dtype_cast.default,backward,26,1,1,1,3264,1,3 +3421,alias_default_1489,call_function,alias.default,backward,26,1,1,0,3265,0,2 +3422,alias_default_822,call_function,alias.default,unknown,,1,1,3,3272,2931,4 +3423,einsum_default_219,call_function,einsum.default,backward,26,2,2,1,3273,3,5 +3424,permute_359,call_function,permute.default,backward,26,1,1,1,4,2927,3 +3425,einsum_default_220,call_function,einsum.default,backward,26,2,2,1,3274,2926,5 +3426,permute_360,call_function,permute.default,backward,26,1,1,1,3274,2,4 +3427,dtype_cast_270,call_function,dtype_cast.default,backward,26,1,1,1,3275,1,4 +3428,alias_default_1484,call_function,alias.default,backward,26,1,1,0,3276,0,3 +3429,view_678,call_function,view.default,backward,26,1,1,1,3275,2925,4 +3430,permute_361,call_function,permute.default,backward,26,1,1,1,3276,2924,4 +3431,_scaled_dot_product_flash_attention_backward_1,call_function,_scaled_dot_product_flash_attention_backward.default,backward,26,8,8,3,3280,2923,2 +3432,getitem_255,call_function,getitem,backward,26,1,1,1,3281,2896,2 
+3433,getitem_256,call_function,getitem,backward,26,1,1,1,3281,2897,2 +3434,getitem_257,call_function,getitem,backward,26,1,1,1,3281,2890,2 +3435,permute_362,call_function,permute.default,backward,26,1,1,1,3282,2889,2 +3436,permute_363,call_function,permute.default,backward,26,1,1,1,3282,2896,2 +3437,permute_364,call_function,permute.default,backward,26,1,1,1,3282,2895,2 +3438,convert_element_type_754,call_function,convert_element_type.default,backward,26,1,1,1,3283,2895,2 +3439,convert_element_type_755,call_function,convert_element_type.default,backward,26,1,1,1,3283,2894,2 +3440,view_679,call_function,view.default,backward,26,1,1,1,3284,2894,2 +3441,view_as_complex_58,call_function,view_as_complex.default,backward,26,1,1,1,3285,2893,6 +3442,_conj_2,call_function,_conj.default,backward,26,1,1,1,4,2894,3 +3443,clone_14,call_function,clone.default,backward,26,1,1,1,5,2893,3 +3444,mul_236,call_function,mul.Tensor,backward,26,2,2,1,3288,2892,8 +3445,view_680,call_function,view.default,backward,26,1,1,1,3284,2893,2 +3446,view_as_complex_59,call_function,view_as_complex.default,backward,26,1,1,1,3285,2892,6 +3447,_conj_3,call_function,_conj.default,backward,26,1,1,1,4,2893,3 +3448,clone_15,call_function,clone.default,backward,26,1,1,1,5,2892,3 +3449,mul_237,call_function,mul.Tensor,backward,26,2,2,1,3288,2891,8 +3450,view_as_real_58,call_function,view_as_real.default,backward,26,1,1,1,3289,2891,6 +3451,view_681,call_function,view.default,backward,26,1,1,1,3290,2890,6 +3452,convert_element_type_756,call_function,convert_element_type.default,backward,26,1,1,1,3291,2889,6 +3453,view_as_real_59,call_function,view_as_real.default,backward,26,1,1,1,3289,2890,6 +3454,view_682,call_function,view.default,backward,26,1,1,1,3290,2889,6 +3455,convert_element_type_757,call_function,convert_element_type.default,backward,26,1,1,1,3291,2888,6 +3456,view_683,call_function,view.default,backward,26,1,1,1,3283,2888,2 +3457,view_684,call_function,view.default,backward,26,1,1,1,3292,2888,5 
+3458,view_685,call_function,view.default,backward,26,1,1,1,3292,2887,5 +3459,alias_default_823,call_function,alias.default,backward,26,1,1,2,3284,2887,4 +3460,einsum_default_221,call_function,einsum.default,backward,26,2,2,1,3285,3,5 +3461,permute_367,call_function,permute.default,backward,26,1,1,1,4,2883,3 +3462,einsum_default_222,call_function,einsum.default,backward,26,2,2,1,3286,2882,5 +3463,permute_368,call_function,permute.default,backward,26,1,1,1,3286,2,4 +3464,dtype_cast_271,call_function,dtype_cast.default,backward,26,1,1,1,3287,1,4 +3465,alias_default_1483,call_function,alias.default,backward,26,1,1,0,3288,0,3 +3466,alias_default_824,call_function,alias.default,backward,26,1,1,2,3293,2887,4 +3467,einsum_default_223,call_function,einsum.default,backward,26,2,2,1,3294,3,5 +3468,permute_371,call_function,permute.default,backward,26,1,1,1,4,2883,3 +3469,einsum_default_224,call_function,einsum.default,backward,26,2,2,1,3295,2882,5 +3470,add_152,call_function,add.Tensor,unknown,,2,2,1,3302,2881,10 +3471,permute_372,call_function,permute.default,backward,26,1,1,1,3295,2,4 +3472,dtype_cast_272,call_function,dtype_cast.default,backward,26,1,1,1,3296,1,4 +3473,alias_default_1482,call_function,alias.default,backward,26,1,1,0,3297,0,3 +3474,alias_default_825,call_function,alias.default,backward,26,1,1,2,3293,2886,4 +3475,einsum_default_225,call_function,einsum.default,backward,26,2,2,1,3294,3,5 +3476,permute_375,call_function,permute.default,backward,26,1,1,1,4,2882,3 +3477,einsum_default_226,call_function,einsum.default,backward,26,2,2,1,3295,2881,5 +3478,add_153,call_function,add.Tensor,unknown,,2,2,1,3318,2880,10 +3479,permute_376,call_function,permute.default,backward,26,1,1,1,3295,2,4 +3480,dtype_cast_273,call_function,dtype_cast.default,backward,26,1,1,1,3296,1,4 +3481,alias_default_1481,call_function,alias.default,backward,26,1,1,0,3297,0,3 +3482,convert_element_type_770,call_function,convert_element_type.default,backward,26,1,1,1,3319,2879,8 
+3483,convert_element_type_771,call_function,convert_element_type.default,backward,26,1,1,1,2869,2879,4 +3484,convert_element_type_772,call_function,convert_element_type.default,backward,26,1,1,1,3,2873,2 +3485,alias_default_826,call_function,alias.default,backward,26,1,1,2,3320,2878,4 +3486,mul_238,call_function,mul.Tensor,backward,26,2,2,1,3322,2872,8 +3487,mul_239,call_function,mul.Tensor,backward,26,2,2,1,2877,2878,8 +3488,alias_default_827,call_function,alias.default,backward,26,1,1,2,3323,2871,4 +3489,alias_default_828,call_function,alias.default,backward,26,1,1,3,2878,2877,4 +3490,mul_240,call_function,mul.Tensor,backward,26,2,2,1,3327,2870,8 +3491,sum_9,call_function,sum.dim_IntList,backward,26,1,1,1,3328,2869,5 +3492,div_32,call_function,div.Tensor,backward,26,1,1,1,2879,2869,6 +3493,mul_241,call_function,mul.Tensor,backward,26,2,2,1,3330,2868,8 +3494,sub_6,call_function,sub.Tensor,backward,26,2,2,1,3331,2867,10 +3495,mul_242,call_function,mul.Tensor,backward,26,2,2,1,3332,2866,8 +3496,mul_243,call_function,mul.Tensor,backward,26,2,2,1,3324,4,8 +3497,sum_10,call_function,sum.dim_IntList,backward,26,1,1,1,3325,3,5 +3498,convert_element_type_773,call_function,convert_element_type.default,backward,26,1,1,1,3333,2865,6 +3499,convert_element_type_774,call_function,convert_element_type.default,backward,26,1,1,1,3326,2,3 +3500,add_154,call_function,add.Tensor,unknown,,2,2,1,3334,2864,10 +3501,dtype_cast_274,call_function,dtype_cast.default,backward,26,1,1,1,3327,1,3 +3502,alias_default_1488,call_function,alias.default,backward,26,1,1,0,3328,0,2 +3503,alias_default_829,call_function,alias.default,unknown,,1,1,3,3335,2863,4 +3504,einsum_default_227,call_function,einsum.default,backward,25,2,2,1,3336,3,5 +3505,permute_379,call_function,permute.default,backward,25,1,1,1,4,2859,3 +3506,einsum_default_228,call_function,einsum.default,backward,25,2,2,1,3337,2858,5 +3507,permute_380,call_function,permute.default,backward,25,1,1,1,3337,2,4 
+3508,dtype_cast_275,call_function,dtype_cast.default,backward,25,1,1,1,3338,1,4 +3509,alias_default_1477,call_function,alias.default,backward,25,1,1,0,3339,0,3 +3510,alias_default_830,call_function,alias.default,backward,25,1,1,2,3338,2857,4 +3511,mul_244,call_function,mul.Tensor,backward,25,2,2,1,3339,2845,8 +3512,mul_245,call_function,mul.Tensor,backward,25,2,2,1,3339,2849,8 +3513,alias_default_831,call_function,alias.default,backward,25,1,1,2,3340,2844,4 +3514,einsum_default_229,call_function,einsum.default,backward,25,2,2,1,3341,3,5 +3515,permute_383,call_function,permute.default,backward,25,1,1,1,4,2840,3 +3516,einsum_default_230,call_function,einsum.default,backward,25,2,2,1,3342,2839,5 +3517,permute_384,call_function,permute.default,backward,25,1,1,1,3342,2,4 +3518,dtype_cast_276,call_function,dtype_cast.default,backward,25,1,1,1,3343,1,4 +3519,alias_default_1478,call_function,alias.default,backward,25,1,1,0,3344,0,3 +3520,convert_element_type_783,call_function,convert_element_type.default,backward,25,1,1,1,3340,2848,6 +3521,convert_element_type_784,call_function,convert_element_type.default,backward,25,1,1,1,2846,2858,4 +3522,alias_default_832,call_function,alias.default,backward,25,1,1,2,2847,2857,4 +3523,neg_30,call_function,neg.default,backward,25,1,1,1,2848,2856,8 +3524,exp_30,call_function,exp.default,backward,25,1,1,1,2849,2855,6 +3525,add_155,call_function,add.Tensor,backward,25,1,1,1,2850,2854,4 +3526,reciprocal_2,call_function,reciprocal.default,backward,25,1,1,1,2851,2853,4 +3527,mul_246,call_function,mul.Tensor,backward,25,1,1,1,2852,2852,6 +3528,alias_default_833,call_function,alias.default,backward,25,1,1,2,2853,2851,4 +3529,mul_247,call_function,mul.Tensor,backward,25,2,2,1,3349,2847,8 +3530,sub_7,call_function,sub.Tensor,backward,25,1,1,1,2854,2849,4 +3531,mul_248,call_function,mul.Tensor,backward,25,2,2,1,2855,2848,8 +3532,add_156,call_function,add.Tensor,backward,25,1,1,1,2856,2847,4 
+3533,mul_249,call_function,mul.Tensor,backward,25,2,2,1,3353,2846,8 +3534,convert_element_type_785,call_function,convert_element_type.default,backward,25,1,1,1,3354,2845,6 +3535,alias_default_834,call_function,alias.default,backward,25,1,1,2,3355,2844,4 +3536,einsum_default_231,call_function,einsum.default,backward,25,2,2,1,3356,3,5 +3537,permute_387,call_function,permute.default,backward,25,1,1,1,4,2840,3 +3538,einsum_default_232,call_function,einsum.default,backward,25,2,2,1,3357,2839,5 +3539,add_157,call_function,add.Tensor,unknown,,2,2,1,3362,2838,10 +3540,permute_388,call_function,permute.default,backward,25,1,1,1,3357,2,4 +3541,dtype_cast_277,call_function,dtype_cast.default,backward,25,1,1,1,3358,1,4 +3542,alias_default_1476,call_function,alias.default,backward,25,1,1,0,3359,0,3 +3543,convert_element_type_790,call_function,convert_element_type.default,backward,25,1,1,1,3363,2837,8 +3544,convert_element_type_791,call_function,convert_element_type.default,backward,25,1,1,1,2826,2837,4 +3545,convert_element_type_792,call_function,convert_element_type.default,backward,25,1,1,1,3,2831,2 +3546,alias_default_835,call_function,alias.default,backward,25,1,1,2,3364,2836,4 +3547,mul_250,call_function,mul.Tensor,backward,25,2,2,1,3366,2830,8 +3548,mul_251,call_function,mul.Tensor,backward,25,2,2,1,2834,2836,8 +3549,alias_default_836,call_function,alias.default,backward,25,1,1,2,3367,2829,4 +3550,alias_default_837,call_function,alias.default,backward,25,1,1,3,2835,2835,4 +3551,mul_252,call_function,mul.Tensor,backward,25,2,2,1,3371,2828,8 +3552,sum_11,call_function,sum.dim_IntList,backward,25,1,1,1,3372,2827,5 +3553,div_33,call_function,div.Tensor,backward,25,1,1,1,2836,2827,6 +3554,mul_253,call_function,mul.Tensor,backward,25,2,2,1,3374,2826,8 +3555,sub_8,call_function,sub.Tensor,backward,25,2,2,1,3375,2825,10 +3556,mul_254,call_function,mul.Tensor,backward,25,2,2,1,3376,2824,8 +3557,mul_255,call_function,mul.Tensor,backward,25,2,2,1,3368,4,8 
+3558,sum_12,call_function,sum.dim_IntList,backward,25,1,1,1,3369,3,5 +3559,convert_element_type_793,call_function,convert_element_type.default,backward,25,1,1,1,3377,2823,6 +3560,convert_element_type_794,call_function,convert_element_type.default,backward,25,1,1,1,3370,2,3 +3561,add_158,call_function,add.Tensor,unknown,,2,2,1,3378,2822,10 +3562,dtype_cast_278,call_function,dtype_cast.default,backward,25,1,1,1,3371,1,3 +3563,alias_default_1480,call_function,alias.default,backward,25,1,1,0,3372,0,2 +3564,alias_default_838,call_function,alias.default,unknown,,1,1,3,3379,2821,4 +3565,einsum_default_233,call_function,einsum.default,backward,25,2,2,1,3380,3,5 +3566,permute_391,call_function,permute.default,backward,25,1,1,1,4,2817,3 +3567,einsum_default_234,call_function,einsum.default,backward,25,2,2,1,3381,2816,5 +3568,permute_392,call_function,permute.default,backward,25,1,1,1,3381,2,4 +3569,dtype_cast_279,call_function,dtype_cast.default,backward,25,1,1,1,3382,1,4 +3570,alias_default_1475,call_function,alias.default,backward,25,1,1,0,3383,0,3 +3571,view_700,call_function,view.default,backward,25,1,1,1,3382,2815,4 +3572,permute_393,call_function,permute.default,backward,25,1,1,1,3383,2814,4 +3573,_scaled_dot_product_flash_attention_backward_2,call_function,_scaled_dot_product_flash_attention_backward.default,backward,25,8,8,3,3387,2813,2 +3574,getitem_258,call_function,getitem,backward,25,1,1,1,3388,2786,2 +3575,getitem_259,call_function,getitem,backward,25,1,1,1,3388,2787,2 +3576,getitem_260,call_function,getitem,backward,25,1,1,1,3388,2780,2 +3577,permute_394,call_function,permute.default,backward,25,1,1,1,3389,2779,2 +3578,permute_395,call_function,permute.default,backward,25,1,1,1,3389,2786,2 +3579,permute_396,call_function,permute.default,backward,25,1,1,1,3389,2785,2 +3580,convert_element_type_799,call_function,convert_element_type.default,backward,25,1,1,1,3390,2785,2 
+3581,convert_element_type_800,call_function,convert_element_type.default,backward,25,1,1,1,3390,2784,2 +3582,view_701,call_function,view.default,backward,25,1,1,1,3391,2784,2 +3583,view_as_complex_60,call_function,view_as_complex.default,backward,25,1,1,1,3392,2783,6 +3584,_conj_4,call_function,_conj.default,backward,25,1,1,1,4,2784,3 +3585,clone_22,call_function,clone.default,backward,25,1,1,1,5,2783,3 +3586,mul_256,call_function,mul.Tensor,backward,25,2,2,1,3395,2782,8 +3587,view_702,call_function,view.default,backward,25,1,1,1,3391,2783,2 +3588,view_as_complex_61,call_function,view_as_complex.default,backward,25,1,1,1,3392,2782,6 +3589,_conj_5,call_function,_conj.default,backward,25,1,1,1,4,2783,3 +3590,clone_23,call_function,clone.default,backward,25,1,1,1,5,2782,3 +3591,mul_257,call_function,mul.Tensor,backward,25,2,2,1,3395,2781,8 +3592,view_as_real_60,call_function,view_as_real.default,backward,25,1,1,1,3396,2781,6 +3593,view_703,call_function,view.default,backward,25,1,1,1,3397,2780,6 +3594,convert_element_type_801,call_function,convert_element_type.default,backward,25,1,1,1,3398,2779,6 +3595,view_as_real_61,call_function,view_as_real.default,backward,25,1,1,1,3396,2780,6 +3596,view_704,call_function,view.default,backward,25,1,1,1,3397,2779,6 +3597,convert_element_type_802,call_function,convert_element_type.default,backward,25,1,1,1,3398,2778,6 +3598,view_705,call_function,view.default,backward,25,1,1,1,3390,2778,2 +3599,view_706,call_function,view.default,backward,25,1,1,1,3399,2778,5 +3600,view_707,call_function,view.default,backward,25,1,1,1,3399,2777,5 +3601,alias_default_839,call_function,alias.default,backward,25,1,1,2,3391,2777,4 +3602,einsum_default_235,call_function,einsum.default,backward,25,2,2,1,3392,3,5 +3603,permute_399,call_function,permute.default,backward,25,1,1,1,4,2773,3 +3604,einsum_default_236,call_function,einsum.default,backward,25,2,2,1,3393,2772,5 +3605,permute_400,call_function,permute.default,backward,25,1,1,1,3393,2,4 
+3606,dtype_cast_280,call_function,dtype_cast.default,backward,25,1,1,1,3394,1,4 +3607,alias_default_1474,call_function,alias.default,backward,25,1,1,0,3395,0,3 +3608,alias_default_840,call_function,alias.default,backward,25,1,1,2,3400,2777,4 +3609,einsum_default_237,call_function,einsum.default,backward,25,2,2,1,3401,3,5 +3610,permute_403,call_function,permute.default,backward,25,1,1,1,4,2773,3 +3611,einsum_default_238,call_function,einsum.default,backward,25,2,2,1,3402,2772,5 +3612,add_159,call_function,add.Tensor,unknown,,2,2,1,3409,2771,10 +3613,permute_404,call_function,permute.default,backward,25,1,1,1,3402,2,4 +3614,dtype_cast_281,call_function,dtype_cast.default,backward,25,1,1,1,3403,1,4 +3615,alias_default_1473,call_function,alias.default,backward,25,1,1,0,3404,0,3 +3616,alias_default_841,call_function,alias.default,backward,25,1,1,2,3400,2776,4 +3617,einsum_default_239,call_function,einsum.default,backward,25,2,2,1,3401,3,5 +3618,permute_407,call_function,permute.default,backward,25,1,1,1,4,2772,3 +3619,einsum_default_240,call_function,einsum.default,backward,25,2,2,1,3402,2771,5 +3620,add_160,call_function,add.Tensor,unknown,,2,2,1,3425,2770,10 +3621,permute_408,call_function,permute.default,backward,25,1,1,1,3402,2,4 +3622,dtype_cast_282,call_function,dtype_cast.default,backward,25,1,1,1,3403,1,4 +3623,alias_default_1472,call_function,alias.default,backward,25,1,1,0,3404,0,3 +3624,convert_element_type_815,call_function,convert_element_type.default,backward,25,1,1,1,3426,2769,8 +3625,convert_element_type_816,call_function,convert_element_type.default,backward,25,1,1,1,2759,2769,4 +3626,convert_element_type_817,call_function,convert_element_type.default,backward,25,1,1,1,3,2763,2 +3627,alias_default_842,call_function,alias.default,backward,25,1,1,2,3427,2768,4 +3628,mul_258,call_function,mul.Tensor,backward,25,2,2,1,3429,2762,8 +3629,mul_259,call_function,mul.Tensor,backward,25,2,2,1,2767,2768,8 
+3630,alias_default_843,call_function,alias.default,backward,25,1,1,2,3430,2761,4 +3631,alias_default_844,call_function,alias.default,backward,25,1,1,3,2768,2767,4 +3632,mul_260,call_function,mul.Tensor,backward,25,2,2,1,3434,2760,8 +3633,sum_13,call_function,sum.dim_IntList,backward,25,1,1,1,3435,2759,5 +3634,div_34,call_function,div.Tensor,backward,25,1,1,1,2769,2759,6 +3635,mul_261,call_function,mul.Tensor,backward,25,2,2,1,3437,2758,8 +3636,sub_9,call_function,sub.Tensor,backward,25,2,2,1,3438,2757,10 +3637,mul_262,call_function,mul.Tensor,backward,25,2,2,1,3439,2756,8 +3638,mul_263,call_function,mul.Tensor,backward,25,2,2,1,3431,4,8 +3639,sum_14,call_function,sum.dim_IntList,backward,25,1,1,1,3432,3,5 +3640,convert_element_type_818,call_function,convert_element_type.default,backward,25,1,1,1,3440,2755,6 +3641,convert_element_type_819,call_function,convert_element_type.default,backward,25,1,1,1,3433,2,3 +3642,add_161,call_function,add.Tensor,unknown,,2,2,1,3441,2754,10 +3643,dtype_cast_283,call_function,dtype_cast.default,backward,25,1,1,1,3434,1,3 +3644,alias_default_1479,call_function,alias.default,backward,25,1,1,0,3435,0,2 +3645,alias_default_845,call_function,alias.default,unknown,,1,1,3,3442,2753,4 +3646,einsum_default_241,call_function,einsum.default,backward,24,2,2,1,3443,3,5 +3647,permute_411,call_function,permute.default,backward,24,1,1,1,4,2749,3 +3648,einsum_default_242,call_function,einsum.default,backward,24,2,2,1,3444,2748,5 +3649,permute_412,call_function,permute.default,backward,24,1,1,1,3444,2,4 +3650,dtype_cast_284,call_function,dtype_cast.default,backward,24,1,1,1,3445,1,4 +3651,alias_default_1468,call_function,alias.default,backward,24,1,1,0,3446,0,3 +3652,alias_default_846,call_function,alias.default,backward,24,1,1,2,3445,2747,4 +3653,mul_264,call_function,mul.Tensor,backward,24,2,2,1,3446,2735,8 +3654,mul_265,call_function,mul.Tensor,backward,24,2,2,1,3446,2739,8 
+3655,alias_default_847,call_function,alias.default,backward,24,1,1,2,3447,2734,4 +3656,einsum_default_243,call_function,einsum.default,backward,24,2,2,1,3448,3,5 +3657,permute_415,call_function,permute.default,backward,24,1,1,1,4,2730,3 +3658,einsum_default_244,call_function,einsum.default,backward,24,2,2,1,3449,2729,5 +3659,permute_416,call_function,permute.default,backward,24,1,1,1,3449,2,4 +3660,dtype_cast_285,call_function,dtype_cast.default,backward,24,1,1,1,3450,1,4 +3661,alias_default_1469,call_function,alias.default,backward,24,1,1,0,3451,0,3 +3662,convert_element_type_828,call_function,convert_element_type.default,backward,24,1,1,1,3447,2738,6 +3663,convert_element_type_829,call_function,convert_element_type.default,backward,24,1,1,1,2736,2748,4 +3664,alias_default_848,call_function,alias.default,backward,24,1,1,2,2737,2747,4 +3665,neg_31,call_function,neg.default,backward,24,1,1,1,2738,2746,8 +3666,exp_31,call_function,exp.default,backward,24,1,1,1,2739,2745,6 +3667,add_162,call_function,add.Tensor,backward,24,1,1,1,2740,2744,4 +3668,reciprocal_3,call_function,reciprocal.default,backward,24,1,1,1,2741,2743,4 +3669,mul_266,call_function,mul.Tensor,backward,24,1,1,1,2742,2742,6 +3670,alias_default_849,call_function,alias.default,backward,24,1,1,2,2743,2741,4 +3671,mul_267,call_function,mul.Tensor,backward,24,2,2,1,3456,2737,8 +3672,sub_10,call_function,sub.Tensor,backward,24,1,1,1,2744,2739,4 +3673,mul_268,call_function,mul.Tensor,backward,24,2,2,1,2745,2738,8 +3674,add_163,call_function,add.Tensor,backward,24,1,1,1,2746,2737,4 +3675,mul_269,call_function,mul.Tensor,backward,24,2,2,1,3460,2736,8 +3676,convert_element_type_830,call_function,convert_element_type.default,backward,24,1,1,1,3461,2735,6 +3677,alias_default_850,call_function,alias.default,backward,24,1,1,2,3462,2734,4 +3678,einsum_default_245,call_function,einsum.default,backward,24,2,2,1,3463,3,5 +3679,permute_419,call_function,permute.default,backward,24,1,1,1,4,2730,3 
+3680,einsum_default_246,call_function,einsum.default,backward,24,2,2,1,3464,2729,5 +3681,add_164,call_function,add.Tensor,unknown,,2,2,1,3469,2728,10 +3682,permute_420,call_function,permute.default,backward,24,1,1,1,3464,2,4 +3683,dtype_cast_286,call_function,dtype_cast.default,backward,24,1,1,1,3465,1,4 +3684,alias_default_1467,call_function,alias.default,backward,24,1,1,0,3466,0,3 +3685,convert_element_type_835,call_function,convert_element_type.default,backward,24,1,1,1,3470,2727,8 +3686,convert_element_type_836,call_function,convert_element_type.default,backward,24,1,1,1,2716,2727,4 +3687,convert_element_type_837,call_function,convert_element_type.default,backward,24,1,1,1,3,2721,2 +3688,alias_default_851,call_function,alias.default,backward,24,1,1,2,3471,2726,4 +3689,mul_270,call_function,mul.Tensor,backward,24,2,2,1,3473,2720,8 +3690,mul_271,call_function,mul.Tensor,backward,24,2,2,1,2724,2726,8 +3691,alias_default_852,call_function,alias.default,backward,24,1,1,2,3474,2719,4 +3692,alias_default_853,call_function,alias.default,backward,24,1,1,3,2725,2725,4 +3693,mul_272,call_function,mul.Tensor,backward,24,2,2,1,3478,2718,8 +3694,sum_15,call_function,sum.dim_IntList,backward,24,1,1,1,3479,2717,5 +3695,div_35,call_function,div.Tensor,backward,24,1,1,1,2726,2717,6 +3696,mul_273,call_function,mul.Tensor,backward,24,2,2,1,3481,2716,8 +3697,sub_11,call_function,sub.Tensor,backward,24,2,2,1,3482,2715,10 +3698,mul_274,call_function,mul.Tensor,backward,24,2,2,1,3483,2714,8 +3699,mul_275,call_function,mul.Tensor,backward,24,2,2,1,3475,4,8 +3700,sum_16,call_function,sum.dim_IntList,backward,24,1,1,1,3476,3,5 +3701,convert_element_type_838,call_function,convert_element_type.default,backward,24,1,1,1,3484,2713,6 +3702,convert_element_type_839,call_function,convert_element_type.default,backward,24,1,1,1,3477,2,3 +3703,add_165,call_function,add.Tensor,unknown,,2,2,1,3485,2712,10 +3704,dtype_cast_287,call_function,dtype_cast.default,backward,24,1,1,1,3478,1,3 
+3705,alias_default_1471,call_function,alias.default,backward,24,1,1,0,3479,0,2 +3706,alias_default_854,call_function,alias.default,unknown,,1,1,3,3486,2711,4 +3707,einsum_default_247,call_function,einsum.default,backward,24,2,2,1,3487,3,5 +3708,permute_423,call_function,permute.default,backward,24,1,1,1,4,2707,3 +3709,einsum_default_248,call_function,einsum.default,backward,24,2,2,1,3488,2706,5 +3710,permute_424,call_function,permute.default,backward,24,1,1,1,3488,2,4 +3711,dtype_cast_288,call_function,dtype_cast.default,backward,24,1,1,1,3489,1,4 +3712,alias_default_1466,call_function,alias.default,backward,24,1,1,0,3490,0,3 +3713,view_722,call_function,view.default,backward,24,1,1,1,3489,2705,4 +3714,permute_425,call_function,permute.default,backward,24,1,1,1,3490,2704,4 +3715,_scaled_dot_product_flash_attention_backward_3,call_function,_scaled_dot_product_flash_attention_backward.default,backward,24,8,8,3,3494,2703,2 +3716,getitem_261,call_function,getitem,backward,24,1,1,1,3495,2676,2 +3717,getitem_262,call_function,getitem,backward,24,1,1,1,3495,2677,2 +3718,getitem_263,call_function,getitem,backward,24,1,1,1,3495,2670,2 +3719,permute_426,call_function,permute.default,backward,24,1,1,1,3496,2669,2 +3720,permute_427,call_function,permute.default,backward,24,1,1,1,3496,2676,2 +3721,permute_428,call_function,permute.default,backward,24,1,1,1,3496,2675,2 +3722,convert_element_type_844,call_function,convert_element_type.default,backward,24,1,1,1,3497,2675,2 +3723,convert_element_type_845,call_function,convert_element_type.default,backward,24,1,1,1,3497,2674,2 +3724,view_723,call_function,view.default,backward,24,1,1,1,3498,2674,2 +3725,view_as_complex_62,call_function,view_as_complex.default,backward,24,1,1,1,3499,2673,6 +3726,_conj_6,call_function,_conj.default,backward,24,1,1,1,4,2674,3 +3727,clone_30,call_function,clone.default,backward,24,1,1,1,5,2673,3 +3728,mul_276,call_function,mul.Tensor,backward,24,2,2,1,3502,2672,8 
+3729,view_724,call_function,view.default,backward,24,1,1,1,3498,2673,2 +3730,view_as_complex_63,call_function,view_as_complex.default,backward,24,1,1,1,3499,2672,6 +3731,_conj_7,call_function,_conj.default,backward,24,1,1,1,4,2673,3 +3732,clone_31,call_function,clone.default,backward,24,1,1,1,5,2672,3 +3733,mul_277,call_function,mul.Tensor,backward,24,2,2,1,3502,2671,8 +3734,view_as_real_62,call_function,view_as_real.default,backward,24,1,1,1,3503,2671,6 +3735,view_725,call_function,view.default,backward,24,1,1,1,3504,2670,6 +3736,convert_element_type_846,call_function,convert_element_type.default,backward,24,1,1,1,3505,2669,6 +3737,view_as_real_63,call_function,view_as_real.default,backward,24,1,1,1,3503,2670,6 +3738,view_726,call_function,view.default,backward,24,1,1,1,3504,2669,6 +3739,convert_element_type_847,call_function,convert_element_type.default,backward,24,1,1,1,3505,2668,6 +3740,view_727,call_function,view.default,backward,24,1,1,1,3497,2668,2 +3741,view_728,call_function,view.default,backward,24,1,1,1,3506,2668,5 +3742,view_729,call_function,view.default,backward,24,1,1,1,3506,2667,5 +3743,alias_default_855,call_function,alias.default,backward,24,1,1,2,3498,2667,4 +3744,einsum_default_249,call_function,einsum.default,backward,24,2,2,1,3499,3,5 +3745,permute_431,call_function,permute.default,backward,24,1,1,1,4,2663,3 +3746,einsum_default_250,call_function,einsum.default,backward,24,2,2,1,3500,2662,5 +3747,permute_432,call_function,permute.default,backward,24,1,1,1,3500,2,4 +3748,dtype_cast_289,call_function,dtype_cast.default,backward,24,1,1,1,3501,1,4 +3749,alias_default_1465,call_function,alias.default,backward,24,1,1,0,3502,0,3 +3750,alias_default_856,call_function,alias.default,backward,24,1,1,2,3507,2667,4 +3751,einsum_default_251,call_function,einsum.default,backward,24,2,2,1,3508,3,5 +3752,permute_435,call_function,permute.default,backward,24,1,1,1,4,2663,3 +3753,einsum_default_252,call_function,einsum.default,backward,24,2,2,1,3509,2662,5 
+3754,add_166,call_function,add.Tensor,unknown,,2,2,1,3516,2661,10 +3755,permute_436,call_function,permute.default,backward,24,1,1,1,3509,2,4 +3756,dtype_cast_290,call_function,dtype_cast.default,backward,24,1,1,1,3510,1,4 +3757,alias_default_1464,call_function,alias.default,backward,24,1,1,0,3511,0,3 +3758,alias_default_857,call_function,alias.default,backward,24,1,1,2,3507,2666,4 +3759,einsum_default_253,call_function,einsum.default,backward,24,2,2,1,3508,3,5 +3760,permute_439,call_function,permute.default,backward,24,1,1,1,4,2662,3 +3761,einsum_default_254,call_function,einsum.default,backward,24,2,2,1,3509,2661,5 +3762,add_167,call_function,add.Tensor,unknown,,2,2,1,3532,2660,10 +3763,permute_440,call_function,permute.default,backward,24,1,1,1,3509,2,4 +3764,dtype_cast_291,call_function,dtype_cast.default,backward,24,1,1,1,3510,1,4 +3765,alias_default_1463,call_function,alias.default,backward,24,1,1,0,3511,0,3 +3766,convert_element_type_860,call_function,convert_element_type.default,backward,24,1,1,1,3533,2659,8 +3767,convert_element_type_861,call_function,convert_element_type.default,backward,24,1,1,1,2649,2659,4 +3768,convert_element_type_862,call_function,convert_element_type.default,backward,24,1,1,1,3,2653,2 +3769,alias_default_858,call_function,alias.default,backward,24,1,1,2,3534,2658,4 +3770,mul_278,call_function,mul.Tensor,backward,24,2,2,1,3536,2652,8 +3771,mul_279,call_function,mul.Tensor,backward,24,2,2,1,2657,2658,8 +3772,alias_default_859,call_function,alias.default,backward,24,1,1,2,3537,2651,4 +3773,alias_default_860,call_function,alias.default,backward,24,1,1,3,2658,2657,4 +3774,mul_280,call_function,mul.Tensor,backward,24,2,2,1,3541,2650,8 +3775,sum_17,call_function,sum.dim_IntList,backward,24,1,1,1,3542,2649,5 +3776,div_36,call_function,div.Tensor,backward,24,1,1,1,2659,2649,6 +3777,mul_281,call_function,mul.Tensor,backward,24,2,2,1,3544,2648,8 +3778,sub_12,call_function,sub.Tensor,backward,24,2,2,1,3545,2647,10 
+3779,mul_282,call_function,mul.Tensor,backward,24,2,2,1,3546,2646,8 +3780,mul_283,call_function,mul.Tensor,backward,24,2,2,1,3538,4,8 +3781,sum_18,call_function,sum.dim_IntList,backward,24,1,1,1,3539,3,5 +3782,convert_element_type_863,call_function,convert_element_type.default,backward,24,1,1,1,3547,2645,6 +3783,convert_element_type_864,call_function,convert_element_type.default,backward,24,1,1,1,3540,2,3 +3784,add_168,call_function,add.Tensor,unknown,,2,2,1,3548,2644,10 +3785,dtype_cast_292,call_function,dtype_cast.default,backward,24,1,1,1,3541,1,3 +3786,alias_default_1470,call_function,alias.default,backward,24,1,1,0,3542,0,2 +3787,alias_default_861,call_function,alias.default,unknown,,1,1,3,3549,2643,4 +3788,einsum_default_255,call_function,einsum.default,backward,23,2,2,1,3550,3,5 +3789,permute_443,call_function,permute.default,backward,23,1,1,1,4,2639,3 +3790,einsum_default_256,call_function,einsum.default,backward,23,2,2,1,3551,2638,5 +3791,permute_444,call_function,permute.default,backward,23,1,1,1,3551,2,4 +3792,dtype_cast_293,call_function,dtype_cast.default,backward,23,1,1,1,3552,1,4 +3793,alias_default_1459,call_function,alias.default,backward,23,1,1,0,3553,0,3 +3794,alias_default_862,call_function,alias.default,backward,23,1,1,2,3552,2637,4 +3795,mul_284,call_function,mul.Tensor,backward,23,2,2,1,3553,2625,8 +3796,mul_285,call_function,mul.Tensor,backward,23,2,2,1,3553,2629,8 +3797,alias_default_863,call_function,alias.default,backward,23,1,1,2,3554,2624,4 +3798,einsum_default_257,call_function,einsum.default,backward,23,2,2,1,3555,3,5 +3799,permute_447,call_function,permute.default,backward,23,1,1,1,4,2620,3 +3800,einsum_default_258,call_function,einsum.default,backward,23,2,2,1,3556,2619,5 +3801,permute_448,call_function,permute.default,backward,23,1,1,1,3556,2,4 +3802,dtype_cast_294,call_function,dtype_cast.default,backward,23,1,1,1,3557,1,4 +3803,alias_default_1460,call_function,alias.default,backward,23,1,1,0,3558,0,3 
+3804,convert_element_type_873,call_function,convert_element_type.default,backward,23,1,1,1,3554,2628,6 +3805,convert_element_type_874,call_function,convert_element_type.default,backward,23,1,1,1,2626,2638,4 +3806,alias_default_864,call_function,alias.default,backward,23,1,1,2,2627,2637,4 +3807,neg_32,call_function,neg.default,backward,23,1,1,1,2628,2636,8 +3808,exp_32,call_function,exp.default,backward,23,1,1,1,2629,2635,6 +3809,add_169,call_function,add.Tensor,backward,23,1,1,1,2630,2634,4 +3810,reciprocal_4,call_function,reciprocal.default,backward,23,1,1,1,2631,2633,4 +3811,mul_286,call_function,mul.Tensor,backward,23,1,1,1,2632,2632,6 +3812,alias_default_865,call_function,alias.default,backward,23,1,1,2,2633,2631,4 +3813,mul_287,call_function,mul.Tensor,backward,23,2,2,1,3563,2627,8 +3814,sub_13,call_function,sub.Tensor,backward,23,1,1,1,2634,2629,4 +3815,mul_288,call_function,mul.Tensor,backward,23,2,2,1,2635,2628,8 +3816,add_170,call_function,add.Tensor,backward,23,1,1,1,2636,2627,4 +3817,mul_289,call_function,mul.Tensor,backward,23,2,2,1,3567,2626,8 +3818,convert_element_type_875,call_function,convert_element_type.default,backward,23,1,1,1,3568,2625,6 +3819,alias_default_866,call_function,alias.default,backward,23,1,1,2,3569,2624,4 +3820,einsum_default_259,call_function,einsum.default,backward,23,2,2,1,3570,3,5 +3821,permute_451,call_function,permute.default,backward,23,1,1,1,4,2620,3 +3822,einsum_default_260,call_function,einsum.default,backward,23,2,2,1,3571,2619,5 +3823,add_171,call_function,add.Tensor,unknown,,2,2,1,3576,2618,10 +3824,permute_452,call_function,permute.default,backward,23,1,1,1,3571,2,4 +3825,dtype_cast_295,call_function,dtype_cast.default,backward,23,1,1,1,3572,1,4 +3826,alias_default_1458,call_function,alias.default,backward,23,1,1,0,3573,0,3 +3827,convert_element_type_880,call_function,convert_element_type.default,backward,23,1,1,1,3577,2617,8 
+3828,convert_element_type_881,call_function,convert_element_type.default,backward,23,1,1,1,2606,2617,4 +3829,convert_element_type_882,call_function,convert_element_type.default,backward,23,1,1,1,3,2611,2 +3830,alias_default_867,call_function,alias.default,backward,23,1,1,2,3578,2616,4 +3831,mul_290,call_function,mul.Tensor,backward,23,2,2,1,3580,2610,8 +3832,mul_291,call_function,mul.Tensor,backward,23,2,2,1,2614,2616,8 +3833,alias_default_868,call_function,alias.default,backward,23,1,1,2,3581,2609,4 +3834,alias_default_869,call_function,alias.default,backward,23,1,1,3,2615,2615,4 +3835,mul_292,call_function,mul.Tensor,backward,23,2,2,1,3585,2608,8 +3836,sum_19,call_function,sum.dim_IntList,backward,23,1,1,1,3586,2607,5 +3837,div_37,call_function,div.Tensor,backward,23,1,1,1,2616,2607,6 +3838,mul_293,call_function,mul.Tensor,backward,23,2,2,1,3588,2606,8 +3839,sub_14,call_function,sub.Tensor,backward,23,2,2,1,3589,2605,10 +3840,mul_294,call_function,mul.Tensor,backward,23,2,2,1,3590,2604,8 +3841,mul_295,call_function,mul.Tensor,backward,23,2,2,1,3582,4,8 +3842,sum_20,call_function,sum.dim_IntList,backward,23,1,1,1,3583,3,5 +3843,convert_element_type_883,call_function,convert_element_type.default,backward,23,1,1,1,3591,2603,6 +3844,convert_element_type_884,call_function,convert_element_type.default,backward,23,1,1,1,3584,2,3 +3845,add_172,call_function,add.Tensor,unknown,,2,2,1,3592,2602,10 +3846,dtype_cast_296,call_function,dtype_cast.default,backward,23,1,1,1,3585,1,3 +3847,alias_default_1462,call_function,alias.default,backward,23,1,1,0,3586,0,2 +3848,alias_default_870,call_function,alias.default,unknown,,1,1,3,3593,2601,4 +3849,einsum_default_261,call_function,einsum.default,backward,23,2,2,1,3594,3,5 +3850,permute_455,call_function,permute.default,backward,23,1,1,1,4,2597,3 +3851,einsum_default_262,call_function,einsum.default,backward,23,2,2,1,3595,2596,5 +3852,permute_456,call_function,permute.default,backward,23,1,1,1,3595,2,4 
+3853,dtype_cast_297,call_function,dtype_cast.default,backward,23,1,1,1,3596,1,4 +3854,alias_default_1457,call_function,alias.default,backward,23,1,1,0,3597,0,3 +3855,view_744,call_function,view.default,backward,23,1,1,1,3596,2595,4 +3856,permute_457,call_function,permute.default,backward,23,1,1,1,3597,2594,4 +3857,_scaled_dot_product_flash_attention_backward_4,call_function,_scaled_dot_product_flash_attention_backward.default,backward,23,8,8,3,3601,2593,2 +3858,getitem_264,call_function,getitem,backward,23,1,1,1,3602,2566,2 +3859,getitem_265,call_function,getitem,backward,23,1,1,1,3602,2567,2 +3860,getitem_266,call_function,getitem,backward,23,1,1,1,3602,2560,2 +3861,permute_458,call_function,permute.default,backward,23,1,1,1,3603,2559,2 +3862,permute_459,call_function,permute.default,backward,23,1,1,1,3603,2566,2 +3863,permute_460,call_function,permute.default,backward,23,1,1,1,3603,2565,2 +3864,convert_element_type_889,call_function,convert_element_type.default,backward,23,1,1,1,3604,2565,2 +3865,convert_element_type_890,call_function,convert_element_type.default,backward,23,1,1,1,3604,2564,2 +3866,view_745,call_function,view.default,backward,23,1,1,1,3605,2564,2 +3867,view_as_complex_64,call_function,view_as_complex.default,backward,23,1,1,1,3606,2563,6 +3868,_conj_8,call_function,_conj.default,backward,23,1,1,1,4,2564,3 +3869,clone_38,call_function,clone.default,backward,23,1,1,1,5,2563,3 +3870,mul_296,call_function,mul.Tensor,backward,23,2,2,1,3609,2562,8 +3871,view_746,call_function,view.default,backward,23,1,1,1,3605,2563,2 +3872,view_as_complex_65,call_function,view_as_complex.default,backward,23,1,1,1,3606,2562,6 +3873,_conj_9,call_function,_conj.default,backward,23,1,1,1,4,2563,3 +3874,clone_39,call_function,clone.default,backward,23,1,1,1,5,2562,3 +3875,mul_297,call_function,mul.Tensor,backward,23,2,2,1,3609,2561,8 +3876,view_as_real_64,call_function,view_as_real.default,backward,23,1,1,1,3610,2561,6 
+3877,view_747,call_function,view.default,backward,23,1,1,1,3611,2560,6 +3878,convert_element_type_891,call_function,convert_element_type.default,backward,23,1,1,1,3612,2559,6 +3879,view_as_real_65,call_function,view_as_real.default,backward,23,1,1,1,3610,2560,6 +3880,view_748,call_function,view.default,backward,23,1,1,1,3611,2559,6 +3881,convert_element_type_892,call_function,convert_element_type.default,backward,23,1,1,1,3612,2558,6 +3882,view_749,call_function,view.default,backward,23,1,1,1,3604,2558,2 +3883,view_750,call_function,view.default,backward,23,1,1,1,3613,2558,5 +3884,view_751,call_function,view.default,backward,23,1,1,1,3613,2557,5 +3885,alias_default_871,call_function,alias.default,backward,23,1,1,2,3605,2557,4 +3886,einsum_default_263,call_function,einsum.default,backward,23,2,2,1,3606,3,5 +3887,permute_463,call_function,permute.default,backward,23,1,1,1,4,2553,3 +3888,einsum_default_264,call_function,einsum.default,backward,23,2,2,1,3607,2552,5 +3889,permute_464,call_function,permute.default,backward,23,1,1,1,3607,2,4 +3890,dtype_cast_298,call_function,dtype_cast.default,backward,23,1,1,1,3608,1,4 +3891,alias_default_1456,call_function,alias.default,backward,23,1,1,0,3609,0,3 +3892,alias_default_872,call_function,alias.default,backward,23,1,1,2,3614,2557,4 +3893,einsum_default_265,call_function,einsum.default,backward,23,2,2,1,3615,3,5 +3894,permute_467,call_function,permute.default,backward,23,1,1,1,4,2553,3 +3895,einsum_default_266,call_function,einsum.default,backward,23,2,2,1,3616,2552,5 +3896,add_173,call_function,add.Tensor,unknown,,2,2,1,3623,2551,10 +3897,permute_468,call_function,permute.default,backward,23,1,1,1,3616,2,4 +3898,dtype_cast_299,call_function,dtype_cast.default,backward,23,1,1,1,3617,1,4 +3899,alias_default_1455,call_function,alias.default,backward,23,1,1,0,3618,0,3 +3900,alias_default_873,call_function,alias.default,backward,23,1,1,2,3614,2556,4 
+3901,einsum_default_267,call_function,einsum.default,backward,23,2,2,1,3615,3,5 +3902,permute_471,call_function,permute.default,backward,23,1,1,1,4,2552,3 +3903,einsum_default_268,call_function,einsum.default,backward,23,2,2,1,3616,2551,5 +3904,add_174,call_function,add.Tensor,unknown,,2,2,1,3639,2550,10 +3905,permute_472,call_function,permute.default,backward,23,1,1,1,3616,2,4 +3906,dtype_cast_300,call_function,dtype_cast.default,backward,23,1,1,1,3617,1,4 +3907,alias_default_1454,call_function,alias.default,backward,23,1,1,0,3618,0,3 +3908,convert_element_type_905,call_function,convert_element_type.default,backward,23,1,1,1,3640,2549,8 +3909,convert_element_type_906,call_function,convert_element_type.default,backward,23,1,1,1,2539,2549,4 +3910,convert_element_type_907,call_function,convert_element_type.default,backward,23,1,1,1,3,2543,2 +3911,alias_default_874,call_function,alias.default,backward,23,1,1,2,3641,2548,4 +3912,mul_298,call_function,mul.Tensor,backward,23,2,2,1,3643,2542,8 +3913,mul_299,call_function,mul.Tensor,backward,23,2,2,1,2547,2548,8 +3914,alias_default_875,call_function,alias.default,backward,23,1,1,2,3644,2541,4 +3915,alias_default_876,call_function,alias.default,backward,23,1,1,3,2548,2547,4 +3916,mul_300,call_function,mul.Tensor,backward,23,2,2,1,3648,2540,8 +3917,sum_21,call_function,sum.dim_IntList,backward,23,1,1,1,3649,2539,5 +3918,div_38,call_function,div.Tensor,backward,23,1,1,1,2549,2539,6 +3919,mul_301,call_function,mul.Tensor,backward,23,2,2,1,3651,2538,8 +3920,sub_15,call_function,sub.Tensor,backward,23,2,2,1,3652,2537,10 +3921,mul_302,call_function,mul.Tensor,backward,23,2,2,1,3653,2536,8 +3922,mul_303,call_function,mul.Tensor,backward,23,2,2,1,3645,4,8 +3923,sum_22,call_function,sum.dim_IntList,backward,23,1,1,1,3646,3,5 +3924,convert_element_type_908,call_function,convert_element_type.default,backward,23,1,1,1,3654,2535,6 +3925,convert_element_type_909,call_function,convert_element_type.default,backward,23,1,1,1,3647,2,3 
+3926,add_175,call_function,add.Tensor,unknown,,2,2,1,3655,2534,10 +3927,dtype_cast_301,call_function,dtype_cast.default,backward,23,1,1,1,3648,1,3 +3928,alias_default_1461,call_function,alias.default,backward,23,1,1,0,3649,0,2 +3929,alias_default_877,call_function,alias.default,unknown,,1,1,3,3656,2533,4 +3930,einsum_default_269,call_function,einsum.default,backward,22,2,2,1,3657,3,5 +3931,permute_475,call_function,permute.default,backward,22,1,1,1,4,2529,3 +3932,einsum_default_270,call_function,einsum.default,backward,22,2,2,1,3658,2528,5 +3933,permute_476,call_function,permute.default,backward,22,1,1,1,3658,2,4 +3934,dtype_cast_302,call_function,dtype_cast.default,backward,22,1,1,1,3659,1,4 +3935,alias_default_1450,call_function,alias.default,backward,22,1,1,0,3660,0,3 +3936,alias_default_878,call_function,alias.default,backward,22,1,1,2,3659,2527,4 +3937,mul_304,call_function,mul.Tensor,backward,22,2,2,1,3660,2515,8 +3938,mul_305,call_function,mul.Tensor,backward,22,2,2,1,3660,2519,8 +3939,alias_default_879,call_function,alias.default,backward,22,1,1,2,3661,2514,4 +3940,einsum_default_271,call_function,einsum.default,backward,22,2,2,1,3662,3,5 +3941,permute_479,call_function,permute.default,backward,22,1,1,1,4,2510,3 +3942,einsum_default_272,call_function,einsum.default,backward,22,2,2,1,3663,2509,5 +3943,permute_480,call_function,permute.default,backward,22,1,1,1,3663,2,4 +3944,dtype_cast_303,call_function,dtype_cast.default,backward,22,1,1,1,3664,1,4 +3945,alias_default_1451,call_function,alias.default,backward,22,1,1,0,3665,0,3 +3946,convert_element_type_918,call_function,convert_element_type.default,backward,22,1,1,1,3661,2518,6 +3947,convert_element_type_919,call_function,convert_element_type.default,backward,22,1,1,1,2516,2528,4 +3948,alias_default_880,call_function,alias.default,backward,22,1,1,2,2517,2527,4 +3949,neg_33,call_function,neg.default,backward,22,1,1,1,2518,2526,8 +3950,exp_33,call_function,exp.default,backward,22,1,1,1,2519,2525,6 
+3951,add_176,call_function,add.Tensor,backward,22,1,1,1,2520,2524,4 +3952,reciprocal_5,call_function,reciprocal.default,backward,22,1,1,1,2521,2523,4 +3953,mul_306,call_function,mul.Tensor,backward,22,1,1,1,2522,2522,6 +3954,alias_default_881,call_function,alias.default,backward,22,1,1,2,2523,2521,4 +3955,mul_307,call_function,mul.Tensor,backward,22,2,2,1,3670,2517,8 +3956,sub_16,call_function,sub.Tensor,backward,22,1,1,1,2524,2519,4 +3957,mul_308,call_function,mul.Tensor,backward,22,2,2,1,2525,2518,8 +3958,add_177,call_function,add.Tensor,backward,22,1,1,1,2526,2517,4 +3959,mul_309,call_function,mul.Tensor,backward,22,2,2,1,3674,2516,8 +3960,convert_element_type_920,call_function,convert_element_type.default,backward,22,1,1,1,3675,2515,6 +3961,alias_default_882,call_function,alias.default,backward,22,1,1,2,3676,2514,4 +3962,einsum_default_273,call_function,einsum.default,backward,22,2,2,1,3677,3,5 +3963,permute_483,call_function,permute.default,backward,22,1,1,1,4,2510,3 +3964,einsum_default_274,call_function,einsum.default,backward,22,2,2,1,3678,2509,5 +3965,add_178,call_function,add.Tensor,unknown,,2,2,1,3683,2508,10 +3966,permute_484,call_function,permute.default,backward,22,1,1,1,3678,2,4 +3967,dtype_cast_304,call_function,dtype_cast.default,backward,22,1,1,1,3679,1,4 +3968,alias_default_1449,call_function,alias.default,backward,22,1,1,0,3680,0,3 +3969,convert_element_type_925,call_function,convert_element_type.default,backward,22,1,1,1,3684,2507,8 +3970,convert_element_type_926,call_function,convert_element_type.default,backward,22,1,1,1,2496,2507,4 +3971,convert_element_type_927,call_function,convert_element_type.default,backward,22,1,1,1,3,2501,2 +3972,alias_default_883,call_function,alias.default,backward,22,1,1,2,3685,2506,4 +3973,mul_310,call_function,mul.Tensor,backward,22,2,2,1,3687,2500,8 +3974,mul_311,call_function,mul.Tensor,backward,22,2,2,1,2504,2506,8 +3975,alias_default_884,call_function,alias.default,backward,22,1,1,2,3688,2499,4 
+3976,alias_default_885,call_function,alias.default,backward,22,1,1,3,2505,2505,4 +3977,mul_312,call_function,mul.Tensor,backward,22,2,2,1,3692,2498,8 +3978,sum_23,call_function,sum.dim_IntList,backward,22,1,1,1,3693,2497,5 +3979,div_39,call_function,div.Tensor,backward,22,1,1,1,2506,2497,6 +3980,mul_313,call_function,mul.Tensor,backward,22,2,2,1,3695,2496,8 +3981,sub_17,call_function,sub.Tensor,backward,22,2,2,1,3696,2495,10 +3982,mul_314,call_function,mul.Tensor,backward,22,2,2,1,3697,2494,8 +3983,mul_315,call_function,mul.Tensor,backward,22,2,2,1,3689,4,8 +3984,sum_24,call_function,sum.dim_IntList,backward,22,1,1,1,3690,3,5 +3985,convert_element_type_928,call_function,convert_element_type.default,backward,22,1,1,1,3698,2493,6 +3986,convert_element_type_929,call_function,convert_element_type.default,backward,22,1,1,1,3691,2,3 +3987,add_179,call_function,add.Tensor,unknown,,2,2,1,3699,2492,10 +3988,dtype_cast_305,call_function,dtype_cast.default,backward,22,1,1,1,3692,1,3 +3989,alias_default_1453,call_function,alias.default,backward,22,1,1,0,3693,0,2 +3990,alias_default_886,call_function,alias.default,unknown,,1,1,3,3700,2491,4 +3991,einsum_default_275,call_function,einsum.default,backward,22,2,2,1,3701,3,5 +3992,permute_487,call_function,permute.default,backward,22,1,1,1,4,2487,3 +3993,einsum_default_276,call_function,einsum.default,backward,22,2,2,1,3702,2486,5 +3994,permute_488,call_function,permute.default,backward,22,1,1,1,3702,2,4 +3995,dtype_cast_306,call_function,dtype_cast.default,backward,22,1,1,1,3703,1,4 +3996,alias_default_1448,call_function,alias.default,backward,22,1,1,0,3704,0,3 +3997,view_766,call_function,view.default,backward,22,1,1,1,3703,2485,4 +3998,permute_489,call_function,permute.default,backward,22,1,1,1,3704,2484,4 +3999,_scaled_dot_product_flash_attention_backward_5,call_function,_scaled_dot_product_flash_attention_backward.default,backward,22,8,8,3,3708,2483,2 +4000,getitem_267,call_function,getitem,backward,22,1,1,1,3709,2456,2 
+4001,getitem_268,call_function,getitem,backward,22,1,1,1,3709,2457,2 +4002,getitem_269,call_function,getitem,backward,22,1,1,1,3709,2450,2 +4003,permute_490,call_function,permute.default,backward,22,1,1,1,3710,2449,2 +4004,permute_491,call_function,permute.default,backward,22,1,1,1,3710,2456,2 +4005,permute_492,call_function,permute.default,backward,22,1,1,1,3710,2455,2 +4006,convert_element_type_934,call_function,convert_element_type.default,backward,22,1,1,1,3711,2455,2 +4007,convert_element_type_935,call_function,convert_element_type.default,backward,22,1,1,1,3711,2454,2 +4008,view_767,call_function,view.default,backward,22,1,1,1,3712,2454,2 +4009,view_as_complex_66,call_function,view_as_complex.default,backward,22,1,1,1,3713,2453,6 +4010,_conj_10,call_function,_conj.default,backward,22,1,1,1,4,2454,3 +4011,clone_46,call_function,clone.default,backward,22,1,1,1,5,2453,3 +4012,mul_316,call_function,mul.Tensor,backward,22,2,2,1,3716,2452,8 +4013,view_768,call_function,view.default,backward,22,1,1,1,3712,2453,2 +4014,view_as_complex_67,call_function,view_as_complex.default,backward,22,1,1,1,3713,2452,6 +4015,_conj_11,call_function,_conj.default,backward,22,1,1,1,4,2453,3 +4016,clone_47,call_function,clone.default,backward,22,1,1,1,5,2452,3 +4017,mul_317,call_function,mul.Tensor,backward,22,2,2,1,3716,2451,8 +4018,view_as_real_66,call_function,view_as_real.default,backward,22,1,1,1,3717,2451,6 +4019,view_769,call_function,view.default,backward,22,1,1,1,3718,2450,6 +4020,convert_element_type_936,call_function,convert_element_type.default,backward,22,1,1,1,3719,2449,6 +4021,view_as_real_67,call_function,view_as_real.default,backward,22,1,1,1,3717,2450,6 +4022,view_770,call_function,view.default,backward,22,1,1,1,3718,2449,6 +4023,convert_element_type_937,call_function,convert_element_type.default,backward,22,1,1,1,3719,2448,6 +4024,view_771,call_function,view.default,backward,22,1,1,1,3711,2448,2 +4025,view_772,call_function,view.default,backward,22,1,1,1,3720,2448,5 
+4026,view_773,call_function,view.default,backward,22,1,1,1,3720,2447,5 +4027,alias_default_887,call_function,alias.default,backward,22,1,1,2,3712,2447,4 +4028,einsum_default_277,call_function,einsum.default,backward,22,2,2,1,3713,3,5 +4029,permute_495,call_function,permute.default,backward,22,1,1,1,4,2443,3 +4030,einsum_default_278,call_function,einsum.default,backward,22,2,2,1,3714,2442,5 +4031,permute_496,call_function,permute.default,backward,22,1,1,1,3714,2,4 +4032,dtype_cast_307,call_function,dtype_cast.default,backward,22,1,1,1,3715,1,4 +4033,alias_default_1447,call_function,alias.default,backward,22,1,1,0,3716,0,3 +4034,alias_default_888,call_function,alias.default,backward,22,1,1,2,3721,2447,4 +4035,einsum_default_279,call_function,einsum.default,backward,22,2,2,1,3722,3,5 +4036,permute_499,call_function,permute.default,backward,22,1,1,1,4,2443,3 +4037,einsum_default_280,call_function,einsum.default,backward,22,2,2,1,3723,2442,5 +4038,add_180,call_function,add.Tensor,unknown,,2,2,1,3730,2441,10 +4039,permute_500,call_function,permute.default,backward,22,1,1,1,3723,2,4 +4040,dtype_cast_308,call_function,dtype_cast.default,backward,22,1,1,1,3724,1,4 +4041,alias_default_1446,call_function,alias.default,backward,22,1,1,0,3725,0,3 +4042,alias_default_889,call_function,alias.default,backward,22,1,1,2,3721,2446,4 +4043,einsum_default_281,call_function,einsum.default,backward,22,2,2,1,3722,3,5 +4044,permute_503,call_function,permute.default,backward,22,1,1,1,4,2442,3 +4045,einsum_default_282,call_function,einsum.default,backward,22,2,2,1,3723,2441,5 +4046,add_181,call_function,add.Tensor,unknown,,2,2,1,3746,2440,10 +4047,permute_504,call_function,permute.default,backward,22,1,1,1,3723,2,4 +4048,dtype_cast_309,call_function,dtype_cast.default,backward,22,1,1,1,3724,1,4 +4049,alias_default_1445,call_function,alias.default,backward,22,1,1,0,3725,0,3 +4050,convert_element_type_950,call_function,convert_element_type.default,backward,22,1,1,1,3747,2439,8 
+4051,convert_element_type_951,call_function,convert_element_type.default,backward,22,1,1,1,2429,2439,4 +4052,convert_element_type_952,call_function,convert_element_type.default,backward,22,1,1,1,3,2433,2 +4053,alias_default_890,call_function,alias.default,backward,22,1,1,2,3748,2438,4 +4054,mul_318,call_function,mul.Tensor,backward,22,2,2,1,3750,2432,8 +4055,mul_319,call_function,mul.Tensor,backward,22,2,2,1,2437,2438,8 +4056,alias_default_891,call_function,alias.default,backward,22,1,1,2,3751,2431,4 +4057,alias_default_892,call_function,alias.default,backward,22,1,1,3,2438,2437,4 +4058,mul_320,call_function,mul.Tensor,backward,22,2,2,1,3755,2430,8 +4059,sum_25,call_function,sum.dim_IntList,backward,22,1,1,1,3756,2429,5 +4060,div_40,call_function,div.Tensor,backward,22,1,1,1,2439,2429,6 +4061,mul_321,call_function,mul.Tensor,backward,22,2,2,1,3758,2428,8 +4062,sub_18,call_function,sub.Tensor,backward,22,2,2,1,3759,2427,10 +4063,mul_322,call_function,mul.Tensor,backward,22,2,2,1,3760,2426,8 +4064,mul_323,call_function,mul.Tensor,backward,22,2,2,1,3752,4,8 +4065,sum_26,call_function,sum.dim_IntList,backward,22,1,1,1,3753,3,5 +4066,convert_element_type_953,call_function,convert_element_type.default,backward,22,1,1,1,3761,2425,6 +4067,convert_element_type_954,call_function,convert_element_type.default,backward,22,1,1,1,3754,2,3 +4068,add_182,call_function,add.Tensor,unknown,,2,2,1,3762,2424,10 +4069,dtype_cast_310,call_function,dtype_cast.default,backward,22,1,1,1,3755,1,3 +4070,alias_default_1452,call_function,alias.default,backward,22,1,1,0,3756,0,2 +4071,alias_default_893,call_function,alias.default,unknown,,1,1,3,3763,2423,4 +4072,einsum_default_283,call_function,einsum.default,backward,21,2,2,1,3764,3,5 +4073,permute_507,call_function,permute.default,backward,21,1,1,1,4,2419,3 +4074,einsum_default_284,call_function,einsum.default,backward,21,2,2,1,3765,2418,5 +4075,permute_508,call_function,permute.default,backward,21,1,1,1,3765,2,4 
+4076,dtype_cast_311,call_function,dtype_cast.default,backward,21,1,1,1,3766,1,4 +4077,alias_default_1441,call_function,alias.default,backward,21,1,1,0,3767,0,3 +4078,alias_default_894,call_function,alias.default,backward,21,1,1,2,3766,2417,4 +4079,mul_324,call_function,mul.Tensor,backward,21,2,2,1,3767,2405,8 +4080,mul_325,call_function,mul.Tensor,backward,21,2,2,1,3767,2409,8 +4081,alias_default_895,call_function,alias.default,backward,21,1,1,2,3768,2404,4 +4082,einsum_default_285,call_function,einsum.default,backward,21,2,2,1,3769,3,5 +4083,permute_511,call_function,permute.default,backward,21,1,1,1,4,2400,3 +4084,einsum_default_286,call_function,einsum.default,backward,21,2,2,1,3770,2399,5 +4085,permute_512,call_function,permute.default,backward,21,1,1,1,3770,2,4 +4086,dtype_cast_312,call_function,dtype_cast.default,backward,21,1,1,1,3771,1,4 +4087,alias_default_1442,call_function,alias.default,backward,21,1,1,0,3772,0,3 +4088,convert_element_type_963,call_function,convert_element_type.default,backward,21,1,1,1,3768,2408,6 +4089,convert_element_type_964,call_function,convert_element_type.default,backward,21,1,1,1,2406,2418,4 +4090,alias_default_896,call_function,alias.default,backward,21,1,1,2,2407,2417,4 +4091,neg_34,call_function,neg.default,backward,21,1,1,1,2408,2416,8 +4092,exp_34,call_function,exp.default,backward,21,1,1,1,2409,2415,6 +4093,add_183,call_function,add.Tensor,backward,21,1,1,1,2410,2414,4 +4094,reciprocal_6,call_function,reciprocal.default,backward,21,1,1,1,2411,2413,4 +4095,mul_326,call_function,mul.Tensor,backward,21,1,1,1,2412,2412,6 +4096,alias_default_897,call_function,alias.default,backward,21,1,1,2,2413,2411,4 +4097,mul_327,call_function,mul.Tensor,backward,21,2,2,1,3777,2407,8 +4098,sub_19,call_function,sub.Tensor,backward,21,1,1,1,2414,2409,4 +4099,mul_328,call_function,mul.Tensor,backward,21,2,2,1,2415,2408,8 +4100,add_184,call_function,add.Tensor,backward,21,1,1,1,2416,2407,4 
+4101,mul_329,call_function,mul.Tensor,backward,21,2,2,1,3781,2406,8 +4102,convert_element_type_965,call_function,convert_element_type.default,backward,21,1,1,1,3782,2405,6 +4103,alias_default_898,call_function,alias.default,backward,21,1,1,2,3783,2404,4 +4104,einsum_default_287,call_function,einsum.default,backward,21,2,2,1,3784,3,5 +4105,permute_515,call_function,permute.default,backward,21,1,1,1,4,2400,3 +4106,einsum_default_288,call_function,einsum.default,backward,21,2,2,1,3785,2399,5 +4107,add_185,call_function,add.Tensor,unknown,,2,2,1,3790,2398,10 +4108,permute_516,call_function,permute.default,backward,21,1,1,1,3785,2,4 +4109,dtype_cast_313,call_function,dtype_cast.default,backward,21,1,1,1,3786,1,4 +4110,alias_default_1440,call_function,alias.default,backward,21,1,1,0,3787,0,3 +4111,convert_element_type_970,call_function,convert_element_type.default,backward,21,1,1,1,3791,2397,8 +4112,convert_element_type_971,call_function,convert_element_type.default,backward,21,1,1,1,2386,2397,4 +4113,convert_element_type_972,call_function,convert_element_type.default,backward,21,1,1,1,3,2391,2 +4114,alias_default_899,call_function,alias.default,backward,21,1,1,2,3792,2396,4 +4115,mul_330,call_function,mul.Tensor,backward,21,2,2,1,3794,2390,8 +4116,mul_331,call_function,mul.Tensor,backward,21,2,2,1,2394,2396,8 +4117,alias_default_900,call_function,alias.default,backward,21,1,1,2,3795,2389,4 +4118,alias_default_901,call_function,alias.default,backward,21,1,1,3,2395,2395,4 +4119,mul_332,call_function,mul.Tensor,backward,21,2,2,1,3799,2388,8 +4120,sum_27,call_function,sum.dim_IntList,backward,21,1,1,1,3800,2387,5 +4121,div_41,call_function,div.Tensor,backward,21,1,1,1,2396,2387,6 +4122,mul_333,call_function,mul.Tensor,backward,21,2,2,1,3802,2386,8 +4123,sub_20,call_function,sub.Tensor,backward,21,2,2,1,3803,2385,10 +4124,mul_334,call_function,mul.Tensor,backward,21,2,2,1,3804,2384,8 +4125,mul_335,call_function,mul.Tensor,backward,21,2,2,1,3796,4,8 
+4126,sum_28,call_function,sum.dim_IntList,backward,21,1,1,1,3797,3,5 +4127,convert_element_type_973,call_function,convert_element_type.default,backward,21,1,1,1,3805,2383,6 +4128,convert_element_type_974,call_function,convert_element_type.default,backward,21,1,1,1,3798,2,3 +4129,add_186,call_function,add.Tensor,unknown,,2,2,1,3806,2382,10 +4130,dtype_cast_314,call_function,dtype_cast.default,backward,21,1,1,1,3799,1,3 +4131,alias_default_1444,call_function,alias.default,backward,21,1,1,0,3800,0,2 +4132,alias_default_902,call_function,alias.default,unknown,,1,1,3,3807,2381,4 +4133,einsum_default_289,call_function,einsum.default,backward,21,2,2,1,3808,3,5 +4134,permute_519,call_function,permute.default,backward,21,1,1,1,4,2377,3 +4135,einsum_default_290,call_function,einsum.default,backward,21,2,2,1,3809,2376,5 +4136,permute_520,call_function,permute.default,backward,21,1,1,1,3809,2,4 +4137,dtype_cast_315,call_function,dtype_cast.default,backward,21,1,1,1,3810,1,4 +4138,alias_default_1439,call_function,alias.default,backward,21,1,1,0,3811,0,3 +4139,view_788,call_function,view.default,backward,21,1,1,1,3810,2375,4 +4140,permute_521,call_function,permute.default,backward,21,1,1,1,3811,2374,4 +4141,_scaled_dot_product_flash_attention_backward_6,call_function,_scaled_dot_product_flash_attention_backward.default,backward,21,8,8,3,3815,2373,2 +4142,getitem_270,call_function,getitem,backward,21,1,1,1,3816,2346,2 +4143,getitem_271,call_function,getitem,backward,21,1,1,1,3816,2347,2 +4144,getitem_272,call_function,getitem,backward,21,1,1,1,3816,2340,2 +4145,permute_522,call_function,permute.default,backward,21,1,1,1,3817,2339,2 +4146,permute_523,call_function,permute.default,backward,21,1,1,1,3817,2346,2 +4147,permute_524,call_function,permute.default,backward,21,1,1,1,3817,2345,2 +4148,convert_element_type_979,call_function,convert_element_type.default,backward,21,1,1,1,3818,2345,2 
+4149,convert_element_type_980,call_function,convert_element_type.default,backward,21,1,1,1,3818,2344,2 +4150,view_789,call_function,view.default,backward,21,1,1,1,3819,2344,2 +4151,view_as_complex_68,call_function,view_as_complex.default,backward,21,1,1,1,3820,2343,6 +4152,_conj_12,call_function,_conj.default,backward,21,1,1,1,4,2344,3 +4153,clone_54,call_function,clone.default,backward,21,1,1,1,5,2343,3 +4154,mul_336,call_function,mul.Tensor,backward,21,2,2,1,3823,2342,8 +4155,view_790,call_function,view.default,backward,21,1,1,1,3819,2343,2 +4156,view_as_complex_69,call_function,view_as_complex.default,backward,21,1,1,1,3820,2342,6 +4157,_conj_13,call_function,_conj.default,backward,21,1,1,1,4,2343,3 +4158,clone_55,call_function,clone.default,backward,21,1,1,1,5,2342,3 +4159,mul_337,call_function,mul.Tensor,backward,21,2,2,1,3823,2341,8 +4160,view_as_real_68,call_function,view_as_real.default,backward,21,1,1,1,3824,2341,6 +4161,view_791,call_function,view.default,backward,21,1,1,1,3825,2340,6 +4162,convert_element_type_981,call_function,convert_element_type.default,backward,21,1,1,1,3826,2339,6 +4163,view_as_real_69,call_function,view_as_real.default,backward,21,1,1,1,3824,2340,6 +4164,view_792,call_function,view.default,backward,21,1,1,1,3825,2339,6 +4165,convert_element_type_982,call_function,convert_element_type.default,backward,21,1,1,1,3826,2338,6 +4166,view_793,call_function,view.default,backward,21,1,1,1,3818,2338,2 +4167,view_794,call_function,view.default,backward,21,1,1,1,3827,2338,5 +4168,view_795,call_function,view.default,backward,21,1,1,1,3827,2337,5 +4169,alias_default_903,call_function,alias.default,backward,21,1,1,2,3819,2337,4 +4170,einsum_default_291,call_function,einsum.default,backward,21,2,2,1,3820,3,5 +4171,permute_527,call_function,permute.default,backward,21,1,1,1,4,2333,3 +4172,einsum_default_292,call_function,einsum.default,backward,21,2,2,1,3821,2332,5 +4173,permute_528,call_function,permute.default,backward,21,1,1,1,3821,2,4 
+4174,dtype_cast_316,call_function,dtype_cast.default,backward,21,1,1,1,3822,1,4 +4175,alias_default_1438,call_function,alias.default,backward,21,1,1,0,3823,0,3 +4176,alias_default_904,call_function,alias.default,backward,21,1,1,2,3828,2337,4 +4177,einsum_default_293,call_function,einsum.default,backward,21,2,2,1,3829,3,5 +4178,permute_531,call_function,permute.default,backward,21,1,1,1,4,2333,3 +4179,einsum_default_294,call_function,einsum.default,backward,21,2,2,1,3830,2332,5 +4180,add_187,call_function,add.Tensor,unknown,,2,2,1,3837,2331,10 +4181,permute_532,call_function,permute.default,backward,21,1,1,1,3830,2,4 +4182,dtype_cast_317,call_function,dtype_cast.default,backward,21,1,1,1,3831,1,4 +4183,alias_default_1437,call_function,alias.default,backward,21,1,1,0,3832,0,3 +4184,alias_default_905,call_function,alias.default,backward,21,1,1,2,3828,2336,4 +4185,einsum_default_295,call_function,einsum.default,backward,21,2,2,1,3829,3,5 +4186,permute_535,call_function,permute.default,backward,21,1,1,1,4,2332,3 +4187,einsum_default_296,call_function,einsum.default,backward,21,2,2,1,3830,2331,5 +4188,add_188,call_function,add.Tensor,unknown,,2,2,1,3853,2330,10 +4189,permute_536,call_function,permute.default,backward,21,1,1,1,3830,2,4 +4190,dtype_cast_318,call_function,dtype_cast.default,backward,21,1,1,1,3831,1,4 +4191,alias_default_1436,call_function,alias.default,backward,21,1,1,0,3832,0,3 +4192,convert_element_type_995,call_function,convert_element_type.default,backward,21,1,1,1,3854,2329,8 +4193,convert_element_type_996,call_function,convert_element_type.default,backward,21,1,1,1,2319,2329,4 +4194,convert_element_type_997,call_function,convert_element_type.default,backward,21,1,1,1,3,2323,2 +4195,alias_default_906,call_function,alias.default,backward,21,1,1,2,3855,2328,4 +4196,mul_338,call_function,mul.Tensor,backward,21,2,2,1,3857,2322,8 +4197,mul_339,call_function,mul.Tensor,backward,21,2,2,1,2327,2328,8 
+4198,alias_default_907,call_function,alias.default,backward,21,1,1,2,3858,2321,4 +4199,alias_default_908,call_function,alias.default,backward,21,1,1,3,2328,2327,4 +4200,mul_340,call_function,mul.Tensor,backward,21,2,2,1,3862,2320,8 +4201,sum_29,call_function,sum.dim_IntList,backward,21,1,1,1,3863,2319,5 +4202,div_42,call_function,div.Tensor,backward,21,1,1,1,2329,2319,6 +4203,mul_341,call_function,mul.Tensor,backward,21,2,2,1,3865,2318,8 +4204,sub_21,call_function,sub.Tensor,backward,21,2,2,1,3866,2317,10 +4205,mul_342,call_function,mul.Tensor,backward,21,2,2,1,3867,2316,8 +4206,mul_343,call_function,mul.Tensor,backward,21,2,2,1,3859,4,8 +4207,sum_30,call_function,sum.dim_IntList,backward,21,1,1,1,3860,3,5 +4208,convert_element_type_998,call_function,convert_element_type.default,backward,21,1,1,1,3868,2315,6 +4209,convert_element_type_999,call_function,convert_element_type.default,backward,21,1,1,1,3861,2,3 +4210,add_189,call_function,add.Tensor,unknown,,2,2,1,3869,2314,10 +4211,dtype_cast_319,call_function,dtype_cast.default,backward,21,1,1,1,3862,1,3 +4212,alias_default_1443,call_function,alias.default,backward,21,1,1,0,3863,0,2 +4213,alias_default_909,call_function,alias.default,unknown,,1,1,3,3870,2313,4 +4214,einsum_default_297,call_function,einsum.default,backward,20,2,2,1,3871,3,5 +4215,permute_539,call_function,permute.default,backward,20,1,1,1,4,2309,3 +4216,einsum_default_298,call_function,einsum.default,backward,20,2,2,1,3872,2308,5 +4217,permute_540,call_function,permute.default,backward,20,1,1,1,3872,2,4 +4218,dtype_cast_320,call_function,dtype_cast.default,backward,20,1,1,1,3873,1,4 +4219,alias_default_1432,call_function,alias.default,backward,20,1,1,0,3874,0,3 +4220,alias_default_910,call_function,alias.default,backward,20,1,1,2,3873,2307,4 +4221,mul_344,call_function,mul.Tensor,backward,20,2,2,1,3874,2295,8 +4222,mul_345,call_function,mul.Tensor,backward,20,2,2,1,3874,2299,8 
+4223,alias_default_911,call_function,alias.default,backward,20,1,1,2,3875,2294,4 +4224,einsum_default_299,call_function,einsum.default,backward,20,2,2,1,3876,3,5 +4225,permute_543,call_function,permute.default,backward,20,1,1,1,4,2290,3 +4226,einsum_default_300,call_function,einsum.default,backward,20,2,2,1,3877,2289,5 +4227,permute_544,call_function,permute.default,backward,20,1,1,1,3877,2,4 +4228,dtype_cast_321,call_function,dtype_cast.default,backward,20,1,1,1,3878,1,4 +4229,alias_default_1433,call_function,alias.default,backward,20,1,1,0,3879,0,3 +4230,convert_element_type_1008,call_function,convert_element_type.default,backward,20,1,1,1,3875,2298,6 +4231,convert_element_type_1009,call_function,convert_element_type.default,backward,20,1,1,1,2296,2308,4 +4232,alias_default_912,call_function,alias.default,backward,20,1,1,2,2297,2307,4 +4233,neg_35,call_function,neg.default,backward,20,1,1,1,2298,2306,8 +4234,exp_35,call_function,exp.default,backward,20,1,1,1,2299,2305,6 +4235,add_190,call_function,add.Tensor,backward,20,1,1,1,2300,2304,4 +4236,reciprocal_7,call_function,reciprocal.default,backward,20,1,1,1,2301,2303,4 +4237,mul_346,call_function,mul.Tensor,backward,20,1,1,1,2302,2302,6 +4238,alias_default_913,call_function,alias.default,backward,20,1,1,2,2303,2301,4 +4239,mul_347,call_function,mul.Tensor,backward,20,2,2,1,3884,2297,8 +4240,sub_22,call_function,sub.Tensor,backward,20,1,1,1,2304,2299,4 +4241,mul_348,call_function,mul.Tensor,backward,20,2,2,1,2305,2298,8 +4242,add_191,call_function,add.Tensor,backward,20,1,1,1,2306,2297,4 +4243,mul_349,call_function,mul.Tensor,backward,20,2,2,1,3888,2296,8 +4244,convert_element_type_1010,call_function,convert_element_type.default,backward,20,1,1,1,3889,2295,6 +4245,alias_default_914,call_function,alias.default,backward,20,1,1,2,3890,2294,4 +4246,einsum_default_301,call_function,einsum.default,backward,20,2,2,1,3891,3,5 +4247,permute_547,call_function,permute.default,backward,20,1,1,1,4,2290,3 
+4248,einsum_default_302,call_function,einsum.default,backward,20,2,2,1,3892,2289,5 +4249,add_192,call_function,add.Tensor,unknown,,2,2,1,3897,2288,10 +4250,permute_548,call_function,permute.default,backward,20,1,1,1,3892,2,4 +4251,dtype_cast_322,call_function,dtype_cast.default,backward,20,1,1,1,3893,1,4 +4252,alias_default_1431,call_function,alias.default,backward,20,1,1,0,3894,0,3 +4253,convert_element_type_1015,call_function,convert_element_type.default,backward,20,1,1,1,3898,2287,8 +4254,convert_element_type_1016,call_function,convert_element_type.default,backward,20,1,1,1,2276,2287,4 +4255,convert_element_type_1017,call_function,convert_element_type.default,backward,20,1,1,1,3,2281,2 +4256,alias_default_915,call_function,alias.default,backward,20,1,1,2,3899,2286,4 +4257,mul_350,call_function,mul.Tensor,backward,20,2,2,1,3901,2280,8 +4258,mul_351,call_function,mul.Tensor,backward,20,2,2,1,2284,2286,8 +4259,alias_default_916,call_function,alias.default,backward,20,1,1,2,3902,2279,4 +4260,alias_default_917,call_function,alias.default,backward,20,1,1,3,2285,2285,4 +4261,mul_352,call_function,mul.Tensor,backward,20,2,2,1,3906,2278,8 +4262,sum_31,call_function,sum.dim_IntList,backward,20,1,1,1,3907,2277,5 +4263,div_43,call_function,div.Tensor,backward,20,1,1,1,2286,2277,6 +4264,mul_353,call_function,mul.Tensor,backward,20,2,2,1,3909,2276,8 +4265,sub_23,call_function,sub.Tensor,backward,20,2,2,1,3910,2275,10 +4266,mul_354,call_function,mul.Tensor,backward,20,2,2,1,3911,2274,8 +4267,mul_355,call_function,mul.Tensor,backward,20,2,2,1,3903,4,8 +4268,sum_32,call_function,sum.dim_IntList,backward,20,1,1,1,3904,3,5 +4269,convert_element_type_1018,call_function,convert_element_type.default,backward,20,1,1,1,3912,2273,6 +4270,convert_element_type_1019,call_function,convert_element_type.default,backward,20,1,1,1,3905,2,3 +4271,add_193,call_function,add.Tensor,unknown,,2,2,1,3913,2272,10 +4272,dtype_cast_323,call_function,dtype_cast.default,backward,20,1,1,1,3906,1,3 
+4273,alias_default_1435,call_function,alias.default,backward,20,1,1,0,3907,0,2 +4274,alias_default_918,call_function,alias.default,unknown,,1,1,3,3914,2271,4 +4275,einsum_default_303,call_function,einsum.default,backward,20,2,2,1,3915,3,5 +4276,permute_551,call_function,permute.default,backward,20,1,1,1,4,2267,3 +4277,einsum_default_304,call_function,einsum.default,backward,20,2,2,1,3916,2266,5 +4278,permute_552,call_function,permute.default,backward,20,1,1,1,3916,2,4 +4279,dtype_cast_324,call_function,dtype_cast.default,backward,20,1,1,1,3917,1,4 +4280,alias_default_1430,call_function,alias.default,backward,20,1,1,0,3918,0,3 +4281,view_810,call_function,view.default,backward,20,1,1,1,3917,2265,4 +4282,permute_553,call_function,permute.default,backward,20,1,1,1,3918,2264,4 +4283,_scaled_dot_product_flash_attention_backward_7,call_function,_scaled_dot_product_flash_attention_backward.default,backward,20,8,8,3,3922,2263,2 +4284,getitem_273,call_function,getitem,backward,20,1,1,1,3923,2236,2 +4285,getitem_274,call_function,getitem,backward,20,1,1,1,3923,2237,2 +4286,getitem_275,call_function,getitem,backward,20,1,1,1,3923,2230,2 +4287,permute_554,call_function,permute.default,backward,20,1,1,1,3924,2229,2 +4288,permute_555,call_function,permute.default,backward,20,1,1,1,3924,2236,2 +4289,permute_556,call_function,permute.default,backward,20,1,1,1,3924,2235,2 +4290,convert_element_type_1024,call_function,convert_element_type.default,backward,20,1,1,1,3925,2235,2 +4291,convert_element_type_1025,call_function,convert_element_type.default,backward,20,1,1,1,3925,2234,2 +4292,view_811,call_function,view.default,backward,20,1,1,1,3926,2234,2 +4293,view_as_complex_70,call_function,view_as_complex.default,backward,20,1,1,1,3927,2233,6 +4294,_conj_14,call_function,_conj.default,backward,20,1,1,1,4,2234,3 +4295,clone_62,call_function,clone.default,backward,20,1,1,1,5,2233,3 +4296,mul_356,call_function,mul.Tensor,backward,20,2,2,1,3930,2232,8 
+4297,view_812,call_function,view.default,backward,20,1,1,1,3926,2233,2 +4298,view_as_complex_71,call_function,view_as_complex.default,backward,20,1,1,1,3927,2232,6 +4299,_conj_15,call_function,_conj.default,backward,20,1,1,1,4,2233,3 +4300,clone_63,call_function,clone.default,backward,20,1,1,1,5,2232,3 +4301,mul_357,call_function,mul.Tensor,backward,20,2,2,1,3930,2231,8 +4302,view_as_real_70,call_function,view_as_real.default,backward,20,1,1,1,3931,2231,6 +4303,view_813,call_function,view.default,backward,20,1,1,1,3932,2230,6 +4304,convert_element_type_1026,call_function,convert_element_type.default,backward,20,1,1,1,3933,2229,6 +4305,view_as_real_71,call_function,view_as_real.default,backward,20,1,1,1,3931,2230,6 +4306,view_814,call_function,view.default,backward,20,1,1,1,3932,2229,6 +4307,convert_element_type_1027,call_function,convert_element_type.default,backward,20,1,1,1,3933,2228,6 +4308,view_815,call_function,view.default,backward,20,1,1,1,3925,2228,2 +4309,view_816,call_function,view.default,backward,20,1,1,1,3934,2228,5 +4310,view_817,call_function,view.default,backward,20,1,1,1,3934,2227,5 +4311,alias_default_919,call_function,alias.default,backward,20,1,1,2,3926,2227,4 +4312,einsum_default_305,call_function,einsum.default,backward,20,2,2,1,3927,3,5 +4313,permute_559,call_function,permute.default,backward,20,1,1,1,4,2223,3 +4314,einsum_default_306,call_function,einsum.default,backward,20,2,2,1,3928,2222,5 +4315,permute_560,call_function,permute.default,backward,20,1,1,1,3928,2,4 +4316,dtype_cast_325,call_function,dtype_cast.default,backward,20,1,1,1,3929,1,4 +4317,alias_default_1429,call_function,alias.default,backward,20,1,1,0,3930,0,3 +4318,alias_default_920,call_function,alias.default,backward,20,1,1,2,3935,2227,4 +4319,einsum_default_307,call_function,einsum.default,backward,20,2,2,1,3936,3,5 +4320,permute_563,call_function,permute.default,backward,20,1,1,1,4,2223,3 +4321,einsum_default_308,call_function,einsum.default,backward,20,2,2,1,3937,2222,5 
+4322,add_194,call_function,add.Tensor,unknown,,2,2,1,3944,2221,10 +4323,permute_564,call_function,permute.default,backward,20,1,1,1,3937,2,4 +4324,dtype_cast_326,call_function,dtype_cast.default,backward,20,1,1,1,3938,1,4 +4325,alias_default_1428,call_function,alias.default,backward,20,1,1,0,3939,0,3 +4326,alias_default_921,call_function,alias.default,backward,20,1,1,2,3935,2226,4 +4327,einsum_default_309,call_function,einsum.default,backward,20,2,2,1,3936,3,5 +4328,permute_567,call_function,permute.default,backward,20,1,1,1,4,2222,3 +4329,einsum_default_310,call_function,einsum.default,backward,20,2,2,1,3937,2221,5 +4330,add_195,call_function,add.Tensor,unknown,,2,2,1,3960,2220,10 +4331,permute_568,call_function,permute.default,backward,20,1,1,1,3937,2,4 +4332,dtype_cast_327,call_function,dtype_cast.default,backward,20,1,1,1,3938,1,4 +4333,alias_default_1427,call_function,alias.default,backward,20,1,1,0,3939,0,3 +4334,convert_element_type_1040,call_function,convert_element_type.default,backward,20,1,1,1,3961,2219,8 +4335,convert_element_type_1041,call_function,convert_element_type.default,backward,20,1,1,1,2209,2219,4 +4336,convert_element_type_1042,call_function,convert_element_type.default,backward,20,1,1,1,3,2213,2 +4337,alias_default_922,call_function,alias.default,backward,20,1,1,2,3962,2218,4 +4338,mul_358,call_function,mul.Tensor,backward,20,2,2,1,3964,2212,8 +4339,mul_359,call_function,mul.Tensor,backward,20,2,2,1,2217,2218,8 +4340,alias_default_923,call_function,alias.default,backward,20,1,1,2,3965,2211,4 +4341,alias_default_924,call_function,alias.default,backward,20,1,1,3,2218,2217,4 +4342,mul_360,call_function,mul.Tensor,backward,20,2,2,1,3969,2210,8 +4343,sum_33,call_function,sum.dim_IntList,backward,20,1,1,1,3970,2209,5 +4344,div_44,call_function,div.Tensor,backward,20,1,1,1,2219,2209,6 +4345,mul_361,call_function,mul.Tensor,backward,20,2,2,1,3972,2208,8 +4346,sub_24,call_function,sub.Tensor,backward,20,2,2,1,3973,2207,10 
+4347,mul_362,call_function,mul.Tensor,backward,20,2,2,1,3974,2206,8 +4348,mul_363,call_function,mul.Tensor,backward,20,2,2,1,3966,4,8 +4349,sum_34,call_function,sum.dim_IntList,backward,20,1,1,1,3967,3,5 +4350,convert_element_type_1043,call_function,convert_element_type.default,backward,20,1,1,1,3975,2205,6 +4351,convert_element_type_1044,call_function,convert_element_type.default,backward,20,1,1,1,3968,2,3 +4352,add_196,call_function,add.Tensor,unknown,,2,2,1,3976,2204,10 +4353,dtype_cast_328,call_function,dtype_cast.default,backward,20,1,1,1,3969,1,3 +4354,alias_default_1434,call_function,alias.default,backward,20,1,1,0,3970,0,2 +4355,alias_default_925,call_function,alias.default,unknown,,1,1,3,3977,2203,4 +4356,einsum_default_311,call_function,einsum.default,backward,19,2,2,1,3978,3,5 +4357,permute_571,call_function,permute.default,backward,19,1,1,1,4,2199,3 +4358,einsum_default_312,call_function,einsum.default,backward,19,2,2,1,3979,2198,5 +4359,permute_572,call_function,permute.default,backward,19,1,1,1,3979,2,4 +4360,dtype_cast_329,call_function,dtype_cast.default,backward,19,1,1,1,3980,1,4 +4361,alias_default_1423,call_function,alias.default,backward,19,1,1,0,3981,0,3 +4362,alias_default_926,call_function,alias.default,backward,19,1,1,2,3980,2197,4 +4363,mul_364,call_function,mul.Tensor,backward,19,2,2,1,3981,2185,8 +4364,mul_365,call_function,mul.Tensor,backward,19,2,2,1,3981,2189,8 +4365,alias_default_927,call_function,alias.default,backward,19,1,1,2,3982,2184,4 +4366,einsum_default_313,call_function,einsum.default,backward,19,2,2,1,3983,3,5 +4367,permute_575,call_function,permute.default,backward,19,1,1,1,4,2180,3 +4368,einsum_default_314,call_function,einsum.default,backward,19,2,2,1,3984,2179,5 +4369,permute_576,call_function,permute.default,backward,19,1,1,1,3984,2,4 +4370,dtype_cast_330,call_function,dtype_cast.default,backward,19,1,1,1,3985,1,4 +4371,alias_default_1424,call_function,alias.default,backward,19,1,1,0,3986,0,3 
+4372,convert_element_type_1053,call_function,convert_element_type.default,backward,19,1,1,1,3982,2188,6 +4373,convert_element_type_1054,call_function,convert_element_type.default,backward,19,1,1,1,2186,2198,4 +4374,alias_default_928,call_function,alias.default,backward,19,1,1,2,2187,2197,4 +4375,neg_36,call_function,neg.default,backward,19,1,1,1,2188,2196,8 +4376,exp_36,call_function,exp.default,backward,19,1,1,1,2189,2195,6 +4377,add_197,call_function,add.Tensor,backward,19,1,1,1,2190,2194,4 +4378,reciprocal_8,call_function,reciprocal.default,backward,19,1,1,1,2191,2193,4 +4379,mul_366,call_function,mul.Tensor,backward,19,1,1,1,2192,2192,6 +4380,alias_default_929,call_function,alias.default,backward,19,1,1,2,2193,2191,4 +4381,mul_367,call_function,mul.Tensor,backward,19,2,2,1,3991,2187,8 +4382,sub_25,call_function,sub.Tensor,backward,19,1,1,1,2194,2189,4 +4383,mul_368,call_function,mul.Tensor,backward,19,2,2,1,2195,2188,8 +4384,add_198,call_function,add.Tensor,backward,19,1,1,1,2196,2187,4 +4385,mul_369,call_function,mul.Tensor,backward,19,2,2,1,3995,2186,8 +4386,convert_element_type_1055,call_function,convert_element_type.default,backward,19,1,1,1,3996,2185,6 +4387,alias_default_930,call_function,alias.default,backward,19,1,1,2,3997,2184,4 +4388,einsum_default_315,call_function,einsum.default,backward,19,2,2,1,3998,3,5 +4389,permute_579,call_function,permute.default,backward,19,1,1,1,4,2180,3 +4390,einsum_default_316,call_function,einsum.default,backward,19,2,2,1,3999,2179,5 +4391,add_199,call_function,add.Tensor,unknown,,2,2,1,4004,2178,10 +4392,permute_580,call_function,permute.default,backward,19,1,1,1,3999,2,4 +4393,dtype_cast_331,call_function,dtype_cast.default,backward,19,1,1,1,4000,1,4 +4394,alias_default_1422,call_function,alias.default,backward,19,1,1,0,4001,0,3 +4395,convert_element_type_1060,call_function,convert_element_type.default,backward,19,1,1,1,4005,2177,8 
+4396,convert_element_type_1061,call_function,convert_element_type.default,backward,19,1,1,1,2166,2177,4 +4397,convert_element_type_1062,call_function,convert_element_type.default,backward,19,1,1,1,3,2171,2 +4398,alias_default_931,call_function,alias.default,backward,19,1,1,2,4006,2176,4 +4399,mul_370,call_function,mul.Tensor,backward,19,2,2,1,4008,2170,8 +4400,mul_371,call_function,mul.Tensor,backward,19,2,2,1,2174,2176,8 +4401,alias_default_932,call_function,alias.default,backward,19,1,1,2,4009,2169,4 +4402,alias_default_933,call_function,alias.default,backward,19,1,1,3,2175,2175,4 +4403,mul_372,call_function,mul.Tensor,backward,19,2,2,1,4013,2168,8 +4404,sum_35,call_function,sum.dim_IntList,backward,19,1,1,1,4014,2167,5 +4405,div_45,call_function,div.Tensor,backward,19,1,1,1,2176,2167,6 +4406,mul_373,call_function,mul.Tensor,backward,19,2,2,1,4016,2166,8 +4407,sub_26,call_function,sub.Tensor,backward,19,2,2,1,4017,2165,10 +4408,mul_374,call_function,mul.Tensor,backward,19,2,2,1,4018,2164,8 +4409,mul_375,call_function,mul.Tensor,backward,19,2,2,1,4010,4,8 +4410,sum_36,call_function,sum.dim_IntList,backward,19,1,1,1,4011,3,5 +4411,convert_element_type_1063,call_function,convert_element_type.default,backward,19,1,1,1,4019,2163,6 +4412,convert_element_type_1064,call_function,convert_element_type.default,backward,19,1,1,1,4012,2,3 +4413,add_200,call_function,add.Tensor,unknown,,2,2,1,4020,2162,10 +4414,dtype_cast_332,call_function,dtype_cast.default,backward,19,1,1,1,4013,1,3 +4415,alias_default_1426,call_function,alias.default,backward,19,1,1,0,4014,0,2 +4416,alias_default_934,call_function,alias.default,unknown,,1,1,3,4021,2161,4 +4417,einsum_default_317,call_function,einsum.default,backward,19,2,2,1,4022,3,5 +4418,permute_583,call_function,permute.default,backward,19,1,1,1,4,2157,3 +4419,einsum_default_318,call_function,einsum.default,backward,19,2,2,1,4023,2156,5 +4420,permute_584,call_function,permute.default,backward,19,1,1,1,4023,2,4 
+4421,dtype_cast_333,call_function,dtype_cast.default,backward,19,1,1,1,4024,1,4 +4422,alias_default_1421,call_function,alias.default,backward,19,1,1,0,4025,0,3 +4423,view_832,call_function,view.default,backward,19,1,1,1,4024,2155,4 +4424,permute_585,call_function,permute.default,backward,19,1,1,1,4025,2154,4 +4425,_scaled_dot_product_flash_attention_backward_8,call_function,_scaled_dot_product_flash_attention_backward.default,backward,19,8,8,3,4029,2153,2 +4426,getitem_276,call_function,getitem,backward,19,1,1,1,4030,2126,2 +4427,getitem_277,call_function,getitem,backward,19,1,1,1,4030,2127,2 +4428,getitem_278,call_function,getitem,backward,19,1,1,1,4030,2120,2 +4429,permute_586,call_function,permute.default,backward,19,1,1,1,4031,2119,2 +4430,permute_587,call_function,permute.default,backward,19,1,1,1,4031,2126,2 +4431,permute_588,call_function,permute.default,backward,19,1,1,1,4031,2125,2 +4432,convert_element_type_1069,call_function,convert_element_type.default,backward,19,1,1,1,4032,2125,2 +4433,convert_element_type_1070,call_function,convert_element_type.default,backward,19,1,1,1,4032,2124,2 +4434,view_833,call_function,view.default,backward,19,1,1,1,4033,2124,2 +4435,view_as_complex_72,call_function,view_as_complex.default,backward,19,1,1,1,4034,2123,6 +4436,_conj_16,call_function,_conj.default,backward,19,1,1,1,4,2124,3 +4437,clone_70,call_function,clone.default,backward,19,1,1,1,5,2123,3 +4438,mul_376,call_function,mul.Tensor,backward,19,2,2,1,4037,2122,8 +4439,view_834,call_function,view.default,backward,19,1,1,1,4033,2123,2 +4440,view_as_complex_73,call_function,view_as_complex.default,backward,19,1,1,1,4034,2122,6 +4441,_conj_17,call_function,_conj.default,backward,19,1,1,1,4,2123,3 +4442,clone_71,call_function,clone.default,backward,19,1,1,1,5,2122,3 +4443,mul_377,call_function,mul.Tensor,backward,19,2,2,1,4037,2121,8 +4444,view_as_real_72,call_function,view_as_real.default,backward,19,1,1,1,4038,2121,6 
+4445,view_835,call_function,view.default,backward,19,1,1,1,4039,2120,6 +4446,convert_element_type_1071,call_function,convert_element_type.default,backward,19,1,1,1,4040,2119,6 +4447,view_as_real_73,call_function,view_as_real.default,backward,19,1,1,1,4038,2120,6 +4448,view_836,call_function,view.default,backward,19,1,1,1,4039,2119,6 +4449,convert_element_type_1072,call_function,convert_element_type.default,backward,19,1,1,1,4040,2118,6 +4450,view_837,call_function,view.default,backward,19,1,1,1,4032,2118,2 +4451,view_838,call_function,view.default,backward,19,1,1,1,4041,2118,5 +4452,view_839,call_function,view.default,backward,19,1,1,1,4041,2117,5 +4453,alias_default_935,call_function,alias.default,backward,19,1,1,2,4033,2117,4 +4454,einsum_default_319,call_function,einsum.default,backward,19,2,2,1,4034,3,5 +4455,permute_591,call_function,permute.default,backward,19,1,1,1,4,2113,3 +4456,einsum_default_320,call_function,einsum.default,backward,19,2,2,1,4035,2112,5 +4457,permute_592,call_function,permute.default,backward,19,1,1,1,4035,2,4 +4458,dtype_cast_334,call_function,dtype_cast.default,backward,19,1,1,1,4036,1,4 +4459,alias_default_1420,call_function,alias.default,backward,19,1,1,0,4037,0,3 +4460,alias_default_936,call_function,alias.default,backward,19,1,1,2,4042,2117,4 +4461,einsum_default_321,call_function,einsum.default,backward,19,2,2,1,4043,3,5 +4462,permute_595,call_function,permute.default,backward,19,1,1,1,4,2113,3 +4463,einsum_default_322,call_function,einsum.default,backward,19,2,2,1,4044,2112,5 +4464,add_201,call_function,add.Tensor,unknown,,2,2,1,4051,2111,10 +4465,permute_596,call_function,permute.default,backward,19,1,1,1,4044,2,4 +4466,dtype_cast_335,call_function,dtype_cast.default,backward,19,1,1,1,4045,1,4 +4467,alias_default_1419,call_function,alias.default,backward,19,1,1,0,4046,0,3 +4468,alias_default_937,call_function,alias.default,backward,19,1,1,2,4042,2116,4 
+4469,einsum_default_323,call_function,einsum.default,backward,19,2,2,1,4043,3,5 +4470,permute_599,call_function,permute.default,backward,19,1,1,1,4,2112,3 +4471,einsum_default_324,call_function,einsum.default,backward,19,2,2,1,4044,2111,5 +4472,add_202,call_function,add.Tensor,unknown,,2,2,1,4067,2110,10 +4473,permute_600,call_function,permute.default,backward,19,1,1,1,4044,2,4 +4474,dtype_cast_336,call_function,dtype_cast.default,backward,19,1,1,1,4045,1,4 +4475,alias_default_1418,call_function,alias.default,backward,19,1,1,0,4046,0,3 +4476,convert_element_type_1085,call_function,convert_element_type.default,backward,19,1,1,1,4068,2109,8 +4477,convert_element_type_1086,call_function,convert_element_type.default,backward,19,1,1,1,2099,2109,4 +4478,convert_element_type_1087,call_function,convert_element_type.default,backward,19,1,1,1,3,2103,2 +4479,alias_default_938,call_function,alias.default,backward,19,1,1,2,4069,2108,4 +4480,mul_378,call_function,mul.Tensor,backward,19,2,2,1,4071,2102,8 +4481,mul_379,call_function,mul.Tensor,backward,19,2,2,1,2107,2108,8 +4482,alias_default_939,call_function,alias.default,backward,19,1,1,2,4072,2101,4 +4483,alias_default_940,call_function,alias.default,backward,19,1,1,3,2108,2107,4 +4484,mul_380,call_function,mul.Tensor,backward,19,2,2,1,4076,2100,8 +4485,sum_37,call_function,sum.dim_IntList,backward,19,1,1,1,4077,2099,5 +4486,div_46,call_function,div.Tensor,backward,19,1,1,1,2109,2099,6 +4487,mul_381,call_function,mul.Tensor,backward,19,2,2,1,4079,2098,8 +4488,sub_27,call_function,sub.Tensor,backward,19,2,2,1,4080,2097,10 +4489,mul_382,call_function,mul.Tensor,backward,19,2,2,1,4081,2096,8 +4490,mul_383,call_function,mul.Tensor,backward,19,2,2,1,4073,4,8 +4491,sum_38,call_function,sum.dim_IntList,backward,19,1,1,1,4074,3,5 +4492,convert_element_type_1088,call_function,convert_element_type.default,backward,19,1,1,1,4082,2095,6 +4493,convert_element_type_1089,call_function,convert_element_type.default,backward,19,1,1,1,4075,2,3 
+4494,add_203,call_function,add.Tensor,unknown,,2,2,1,4083,2094,10 +4495,dtype_cast_337,call_function,dtype_cast.default,backward,19,1,1,1,4076,1,3 +4496,alias_default_1425,call_function,alias.default,backward,19,1,1,0,4077,0,2 +4497,alias_default_941,call_function,alias.default,unknown,,1,1,3,4084,2093,4 +4498,einsum_default_325,call_function,einsum.default,backward,18,2,2,1,4085,3,5 +4499,permute_603,call_function,permute.default,backward,18,1,1,1,4,2089,3 +4500,einsum_default_326,call_function,einsum.default,backward,18,2,2,1,4086,2088,5 +4501,permute_604,call_function,permute.default,backward,18,1,1,1,4086,2,4 +4502,dtype_cast_338,call_function,dtype_cast.default,backward,18,1,1,1,4087,1,4 +4503,alias_default_1414,call_function,alias.default,backward,18,1,1,0,4088,0,3 +4504,alias_default_942,call_function,alias.default,backward,18,1,1,2,4087,2087,4 +4505,mul_384,call_function,mul.Tensor,backward,18,2,2,1,4088,2075,8 +4506,mul_385,call_function,mul.Tensor,backward,18,2,2,1,4088,2079,8 +4507,alias_default_943,call_function,alias.default,backward,18,1,1,2,4089,2074,4 +4508,einsum_default_327,call_function,einsum.default,backward,18,2,2,1,4090,3,5 +4509,permute_607,call_function,permute.default,backward,18,1,1,1,4,2070,3 +4510,einsum_default_328,call_function,einsum.default,backward,18,2,2,1,4091,2069,5 +4511,permute_608,call_function,permute.default,backward,18,1,1,1,4091,2,4 +4512,dtype_cast_339,call_function,dtype_cast.default,backward,18,1,1,1,4092,1,4 +4513,alias_default_1415,call_function,alias.default,backward,18,1,1,0,4093,0,3 +4514,convert_element_type_1098,call_function,convert_element_type.default,backward,18,1,1,1,4089,2078,6 +4515,convert_element_type_1099,call_function,convert_element_type.default,backward,18,1,1,1,2076,2088,4 +4516,alias_default_944,call_function,alias.default,backward,18,1,1,2,2077,2087,4 +4517,neg_37,call_function,neg.default,backward,18,1,1,1,2078,2086,8 +4518,exp_37,call_function,exp.default,backward,18,1,1,1,2079,2085,6 
+4519,add_204,call_function,add.Tensor,backward,18,1,1,1,2080,2084,4 +4520,reciprocal_9,call_function,reciprocal.default,backward,18,1,1,1,2081,2083,4 +4521,mul_386,call_function,mul.Tensor,backward,18,1,1,1,2082,2082,6 +4522,alias_default_945,call_function,alias.default,backward,18,1,1,2,2083,2081,4 +4523,mul_387,call_function,mul.Tensor,backward,18,2,2,1,4098,2077,8 +4524,sub_28,call_function,sub.Tensor,backward,18,1,1,1,2084,2079,4 +4525,mul_388,call_function,mul.Tensor,backward,18,2,2,1,2085,2078,8 +4526,add_205,call_function,add.Tensor,backward,18,1,1,1,2086,2077,4 +4527,mul_389,call_function,mul.Tensor,backward,18,2,2,1,4102,2076,8 +4528,convert_element_type_1100,call_function,convert_element_type.default,backward,18,1,1,1,4103,2075,6 +4529,alias_default_946,call_function,alias.default,backward,18,1,1,2,4104,2074,4 +4530,einsum_default_329,call_function,einsum.default,backward,18,2,2,1,4105,3,5 +4531,permute_611,call_function,permute.default,backward,18,1,1,1,4,2070,3 +4532,einsum_default_330,call_function,einsum.default,backward,18,2,2,1,4106,2069,5 +4533,add_206,call_function,add.Tensor,unknown,,2,2,1,4111,2068,10 +4534,permute_612,call_function,permute.default,backward,18,1,1,1,4106,2,4 +4535,dtype_cast_340,call_function,dtype_cast.default,backward,18,1,1,1,4107,1,4 +4536,alias_default_1413,call_function,alias.default,backward,18,1,1,0,4108,0,3 +4537,convert_element_type_1105,call_function,convert_element_type.default,backward,18,1,1,1,4112,2067,8 +4538,convert_element_type_1106,call_function,convert_element_type.default,backward,18,1,1,1,2056,2067,4 +4539,convert_element_type_1107,call_function,convert_element_type.default,backward,18,1,1,1,3,2061,2 +4540,alias_default_947,call_function,alias.default,backward,18,1,1,2,4113,2066,4 +4541,mul_390,call_function,mul.Tensor,backward,18,2,2,1,4115,2060,8 +4542,mul_391,call_function,mul.Tensor,backward,18,2,2,1,2064,2066,8 +4543,alias_default_948,call_function,alias.default,backward,18,1,1,2,4116,2059,4 
+4544,alias_default_949,call_function,alias.default,backward,18,1,1,3,2065,2065,4 +4545,mul_392,call_function,mul.Tensor,backward,18,2,2,1,4120,2058,8 +4546,sum_39,call_function,sum.dim_IntList,backward,18,1,1,1,4121,2057,5 +4547,div_47,call_function,div.Tensor,backward,18,1,1,1,2066,2057,6 +4548,mul_393,call_function,mul.Tensor,backward,18,2,2,1,4123,2056,8 +4549,sub_29,call_function,sub.Tensor,backward,18,2,2,1,4124,2055,10 +4550,mul_394,call_function,mul.Tensor,backward,18,2,2,1,4125,2054,8 +4551,mul_395,call_function,mul.Tensor,backward,18,2,2,1,4117,4,8 +4552,sum_40,call_function,sum.dim_IntList,backward,18,1,1,1,4118,3,5 +4553,convert_element_type_1108,call_function,convert_element_type.default,backward,18,1,1,1,4126,2053,6 +4554,convert_element_type_1109,call_function,convert_element_type.default,backward,18,1,1,1,4119,2,3 +4555,add_207,call_function,add.Tensor,unknown,,2,2,1,4127,2052,10 +4556,dtype_cast_341,call_function,dtype_cast.default,backward,18,1,1,1,4120,1,3 +4557,alias_default_1417,call_function,alias.default,backward,18,1,1,0,4121,0,2 +4558,alias_default_950,call_function,alias.default,unknown,,1,1,3,4128,2051,4 +4559,einsum_default_331,call_function,einsum.default,backward,18,2,2,1,4129,3,5 +4560,permute_615,call_function,permute.default,backward,18,1,1,1,4,2047,3 +4561,einsum_default_332,call_function,einsum.default,backward,18,2,2,1,4130,2046,5 +4562,permute_616,call_function,permute.default,backward,18,1,1,1,4130,2,4 +4563,dtype_cast_342,call_function,dtype_cast.default,backward,18,1,1,1,4131,1,4 +4564,alias_default_1412,call_function,alias.default,backward,18,1,1,0,4132,0,3 +4565,view_854,call_function,view.default,backward,18,1,1,1,4131,2045,4 +4566,permute_617,call_function,permute.default,backward,18,1,1,1,4132,2044,4 +4567,_scaled_dot_product_flash_attention_backward_9,call_function,_scaled_dot_product_flash_attention_backward.default,backward,18,8,8,3,4136,2043,2 +4568,getitem_279,call_function,getitem,backward,18,1,1,1,4137,2016,2 
+4569,getitem_280,call_function,getitem,backward,18,1,1,1,4137,2017,2 +4570,getitem_281,call_function,getitem,backward,18,1,1,1,4137,2010,2 +4571,permute_618,call_function,permute.default,backward,18,1,1,1,4138,2009,2 +4572,permute_619,call_function,permute.default,backward,18,1,1,1,4138,2016,2 +4573,permute_620,call_function,permute.default,backward,18,1,1,1,4138,2015,2 +4574,convert_element_type_1114,call_function,convert_element_type.default,backward,18,1,1,1,4139,2015,2 +4575,convert_element_type_1115,call_function,convert_element_type.default,backward,18,1,1,1,4139,2014,2 +4576,view_855,call_function,view.default,backward,18,1,1,1,4140,2014,2 +4577,view_as_complex_74,call_function,view_as_complex.default,backward,18,1,1,1,4141,2013,6 +4578,_conj_18,call_function,_conj.default,backward,18,1,1,1,4,2014,3 +4579,clone_78,call_function,clone.default,backward,18,1,1,1,5,2013,3 +4580,mul_396,call_function,mul.Tensor,backward,18,2,2,1,4144,2012,8 +4581,view_856,call_function,view.default,backward,18,1,1,1,4140,2013,2 +4582,view_as_complex_75,call_function,view_as_complex.default,backward,18,1,1,1,4141,2012,6 +4583,_conj_19,call_function,_conj.default,backward,18,1,1,1,4,2013,3 +4584,clone_79,call_function,clone.default,backward,18,1,1,1,5,2012,3 +4585,mul_397,call_function,mul.Tensor,backward,18,2,2,1,4144,2011,8 +4586,view_as_real_74,call_function,view_as_real.default,backward,18,1,1,1,4145,2011,6 +4587,view_857,call_function,view.default,backward,18,1,1,1,4146,2010,6 +4588,convert_element_type_1116,call_function,convert_element_type.default,backward,18,1,1,1,4147,2009,6 +4589,view_as_real_75,call_function,view_as_real.default,backward,18,1,1,1,4145,2010,6 +4590,view_858,call_function,view.default,backward,18,1,1,1,4146,2009,6 +4591,convert_element_type_1117,call_function,convert_element_type.default,backward,18,1,1,1,4147,2008,6 +4592,view_859,call_function,view.default,backward,18,1,1,1,4139,2008,2 
+4593,view_860,call_function,view.default,backward,18,1,1,1,4148,2008,5 +4594,view_861,call_function,view.default,backward,18,1,1,1,4148,2007,5 +4595,alias_default_951,call_function,alias.default,backward,18,1,1,2,4140,2007,4 +4596,einsum_default_333,call_function,einsum.default,backward,18,2,2,1,4141,3,5 +4597,permute_623,call_function,permute.default,backward,18,1,1,1,4,2003,3 +4598,einsum_default_334,call_function,einsum.default,backward,18,2,2,1,4142,2002,5 +4599,permute_624,call_function,permute.default,backward,18,1,1,1,4142,2,4 +4600,dtype_cast_343,call_function,dtype_cast.default,backward,18,1,1,1,4143,1,4 +4601,alias_default_1411,call_function,alias.default,backward,18,1,1,0,4144,0,3 +4602,alias_default_952,call_function,alias.default,backward,18,1,1,2,4149,2007,4 +4603,einsum_default_335,call_function,einsum.default,backward,18,2,2,1,4150,3,5 +4604,permute_627,call_function,permute.default,backward,18,1,1,1,4,2003,3 +4605,einsum_default_336,call_function,einsum.default,backward,18,2,2,1,4151,2002,5 +4606,add_208,call_function,add.Tensor,unknown,,2,2,1,4158,2001,10 +4607,permute_628,call_function,permute.default,backward,18,1,1,1,4151,2,4 +4608,dtype_cast_344,call_function,dtype_cast.default,backward,18,1,1,1,4152,1,4 +4609,alias_default_1410,call_function,alias.default,backward,18,1,1,0,4153,0,3 +4610,alias_default_953,call_function,alias.default,backward,18,1,1,2,4149,2006,4 +4611,einsum_default_337,call_function,einsum.default,backward,18,2,2,1,4150,3,5 +4612,permute_631,call_function,permute.default,backward,18,1,1,1,4,2002,3 +4613,einsum_default_338,call_function,einsum.default,backward,18,2,2,1,4151,2001,5 +4614,add_209,call_function,add.Tensor,unknown,,2,2,1,4174,2000,10 +4615,permute_632,call_function,permute.default,backward,18,1,1,1,4151,2,4 +4616,dtype_cast_345,call_function,dtype_cast.default,backward,18,1,1,1,4152,1,4 +4617,alias_default_1409,call_function,alias.default,backward,18,1,1,0,4153,0,3 
+4618,convert_element_type_1130,call_function,convert_element_type.default,backward,18,1,1,1,4175,1999,8 +4619,convert_element_type_1131,call_function,convert_element_type.default,backward,18,1,1,1,1989,1999,4 +4620,convert_element_type_1132,call_function,convert_element_type.default,backward,18,1,1,1,3,1993,2 +4621,alias_default_954,call_function,alias.default,backward,18,1,1,2,4176,1998,4 +4622,mul_398,call_function,mul.Tensor,backward,18,2,2,1,4178,1992,8 +4623,mul_399,call_function,mul.Tensor,backward,18,2,2,1,1997,1998,8 +4624,alias_default_955,call_function,alias.default,backward,18,1,1,2,4179,1991,4 +4625,alias_default_956,call_function,alias.default,backward,18,1,1,3,1998,1997,4 +4626,mul_400,call_function,mul.Tensor,backward,18,2,2,1,4183,1990,8 +4627,sum_41,call_function,sum.dim_IntList,backward,18,1,1,1,4184,1989,5 +4628,div_48,call_function,div.Tensor,backward,18,1,1,1,1999,1989,6 +4629,mul_401,call_function,mul.Tensor,backward,18,2,2,1,4186,1988,8 +4630,sub_30,call_function,sub.Tensor,backward,18,2,2,1,4187,1987,10 +4631,mul_402,call_function,mul.Tensor,backward,18,2,2,1,4188,1986,8 +4632,mul_403,call_function,mul.Tensor,backward,18,2,2,1,4180,4,8 +4633,sum_42,call_function,sum.dim_IntList,backward,18,1,1,1,4181,3,5 +4634,convert_element_type_1133,call_function,convert_element_type.default,backward,18,1,1,1,4189,1985,6 +4635,convert_element_type_1134,call_function,convert_element_type.default,backward,18,1,1,1,4182,2,3 +4636,add_210,call_function,add.Tensor,unknown,,2,2,1,4190,1984,10 +4637,dtype_cast_346,call_function,dtype_cast.default,backward,18,1,1,1,4183,1,3 +4638,alias_default_1416,call_function,alias.default,backward,18,1,1,0,4184,0,2 +4639,alias_default_957,call_function,alias.default,unknown,,1,1,3,4191,1983,4 +4640,einsum_default_339,call_function,einsum.default,backward,17,2,2,1,4192,3,5 +4641,permute_635,call_function,permute.default,backward,17,1,1,1,4,1979,3 
+4642,einsum_default_340,call_function,einsum.default,backward,17,2,2,1,4193,1978,5 +4643,permute_636,call_function,permute.default,backward,17,1,1,1,4193,2,4 +4644,dtype_cast_347,call_function,dtype_cast.default,backward,17,1,1,1,4194,1,4 +4645,alias_default_1405,call_function,alias.default,backward,17,1,1,0,4195,0,3 +4646,alias_default_958,call_function,alias.default,backward,17,1,1,2,4194,1977,4 +4647,mul_404,call_function,mul.Tensor,backward,17,2,2,1,4195,1965,8 +4648,mul_405,call_function,mul.Tensor,backward,17,2,2,1,4195,1969,8 +4649,alias_default_959,call_function,alias.default,backward,17,1,1,2,4196,1964,4 +4650,einsum_default_341,call_function,einsum.default,backward,17,2,2,1,4197,3,5 +4651,permute_639,call_function,permute.default,backward,17,1,1,1,4,1960,3 +4652,einsum_default_342,call_function,einsum.default,backward,17,2,2,1,4198,1959,5 +4653,permute_640,call_function,permute.default,backward,17,1,1,1,4198,2,4 +4654,dtype_cast_348,call_function,dtype_cast.default,backward,17,1,1,1,4199,1,4 +4655,alias_default_1406,call_function,alias.default,backward,17,1,1,0,4200,0,3 +4656,convert_element_type_1143,call_function,convert_element_type.default,backward,17,1,1,1,4196,1968,6 +4657,convert_element_type_1144,call_function,convert_element_type.default,backward,17,1,1,1,1966,1978,4 +4658,alias_default_960,call_function,alias.default,backward,17,1,1,2,1967,1977,4 +4659,neg_38,call_function,neg.default,backward,17,1,1,1,1968,1976,8 +4660,exp_38,call_function,exp.default,backward,17,1,1,1,1969,1975,6 +4661,add_211,call_function,add.Tensor,backward,17,1,1,1,1970,1974,4 +4662,reciprocal_10,call_function,reciprocal.default,backward,17,1,1,1,1971,1973,4 +4663,mul_406,call_function,mul.Tensor,backward,17,1,1,1,1972,1972,6 +4664,alias_default_961,call_function,alias.default,backward,17,1,1,2,1973,1971,4 +4665,mul_407,call_function,mul.Tensor,backward,17,2,2,1,4205,1967,8 +4666,sub_31,call_function,sub.Tensor,backward,17,1,1,1,1974,1969,4 
+4667,mul_408,call_function,mul.Tensor,backward,17,2,2,1,1975,1968,8 +4668,add_212,call_function,add.Tensor,backward,17,1,1,1,1976,1967,4 +4669,mul_409,call_function,mul.Tensor,backward,17,2,2,1,4209,1966,8 +4670,convert_element_type_1145,call_function,convert_element_type.default,backward,17,1,1,1,4210,1965,6 +4671,alias_default_962,call_function,alias.default,backward,17,1,1,2,4211,1964,4 +4672,einsum_default_343,call_function,einsum.default,backward,17,2,2,1,4212,3,5 +4673,permute_643,call_function,permute.default,backward,17,1,1,1,4,1960,3 +4674,einsum_default_344,call_function,einsum.default,backward,17,2,2,1,4213,1959,5 +4675,add_213,call_function,add.Tensor,unknown,,2,2,1,4218,1958,10 +4676,permute_644,call_function,permute.default,backward,17,1,1,1,4213,2,4 +4677,dtype_cast_349,call_function,dtype_cast.default,backward,17,1,1,1,4214,1,4 +4678,alias_default_1404,call_function,alias.default,backward,17,1,1,0,4215,0,3 +4679,convert_element_type_1150,call_function,convert_element_type.default,backward,17,1,1,1,4219,1957,8 +4680,convert_element_type_1151,call_function,convert_element_type.default,backward,17,1,1,1,1946,1957,4 +4681,convert_element_type_1152,call_function,convert_element_type.default,backward,17,1,1,1,3,1951,2 +4682,alias_default_963,call_function,alias.default,backward,17,1,1,2,4220,1956,4 +4683,mul_410,call_function,mul.Tensor,backward,17,2,2,1,4222,1950,8 +4684,mul_411,call_function,mul.Tensor,backward,17,2,2,1,1954,1956,8 +4685,alias_default_964,call_function,alias.default,backward,17,1,1,2,4223,1949,4 +4686,alias_default_965,call_function,alias.default,backward,17,1,1,3,1955,1955,4 +4687,mul_412,call_function,mul.Tensor,backward,17,2,2,1,4227,1948,8 +4688,sum_43,call_function,sum.dim_IntList,backward,17,1,1,1,4228,1947,5 +4689,div_49,call_function,div.Tensor,backward,17,1,1,1,1956,1947,6 +4690,mul_413,call_function,mul.Tensor,backward,17,2,2,1,4230,1946,8 +4691,sub_32,call_function,sub.Tensor,backward,17,2,2,1,4231,1945,10 
+4692,mul_414,call_function,mul.Tensor,backward,17,2,2,1,4232,1944,8 +4693,mul_415,call_function,mul.Tensor,backward,17,2,2,1,4224,4,8 +4694,sum_44,call_function,sum.dim_IntList,backward,17,1,1,1,4225,3,5 +4695,convert_element_type_1153,call_function,convert_element_type.default,backward,17,1,1,1,4233,1943,6 +4696,convert_element_type_1154,call_function,convert_element_type.default,backward,17,1,1,1,4226,2,3 +4697,add_214,call_function,add.Tensor,unknown,,2,2,1,4234,1942,10 +4698,dtype_cast_350,call_function,dtype_cast.default,backward,17,1,1,1,4227,1,3 +4699,alias_default_1408,call_function,alias.default,backward,17,1,1,0,4228,0,2 +4700,alias_default_966,call_function,alias.default,unknown,,1,1,3,4235,1941,4 +4701,einsum_default_345,call_function,einsum.default,backward,17,2,2,1,4236,3,5 +4702,permute_647,call_function,permute.default,backward,17,1,1,1,4,1937,3 +4703,einsum_default_346,call_function,einsum.default,backward,17,2,2,1,4237,1936,5 +4704,permute_648,call_function,permute.default,backward,17,1,1,1,4237,2,4 +4705,dtype_cast_351,call_function,dtype_cast.default,backward,17,1,1,1,4238,1,4 +4706,alias_default_1403,call_function,alias.default,backward,17,1,1,0,4239,0,3 +4707,view_876,call_function,view.default,backward,17,1,1,1,4238,1935,4 +4708,permute_649,call_function,permute.default,backward,17,1,1,1,4239,1934,4 +4709,_scaled_dot_product_flash_attention_backward_10,call_function,_scaled_dot_product_flash_attention_backward.default,backward,17,8,8,3,4243,1933,2 +4710,getitem_282,call_function,getitem,backward,17,1,1,1,4244,1906,2 +4711,getitem_283,call_function,getitem,backward,17,1,1,1,4244,1907,2 +4712,getitem_284,call_function,getitem,backward,17,1,1,1,4244,1900,2 +4713,permute_650,call_function,permute.default,backward,17,1,1,1,4245,1899,2 +4714,permute_651,call_function,permute.default,backward,17,1,1,1,4245,1906,2 +4715,permute_652,call_function,permute.default,backward,17,1,1,1,4245,1905,2 
+4716,convert_element_type_1159,call_function,convert_element_type.default,backward,17,1,1,1,4246,1905,2 +4717,convert_element_type_1160,call_function,convert_element_type.default,backward,17,1,1,1,4246,1904,2 +4718,view_877,call_function,view.default,backward,17,1,1,1,4247,1904,2 +4719,view_as_complex_76,call_function,view_as_complex.default,backward,17,1,1,1,4248,1903,6 +4720,_conj_20,call_function,_conj.default,backward,17,1,1,1,4,1904,3 +4721,clone_86,call_function,clone.default,backward,17,1,1,1,5,1903,3 +4722,mul_416,call_function,mul.Tensor,backward,17,2,2,1,4251,1902,8 +4723,view_878,call_function,view.default,backward,17,1,1,1,4247,1903,2 +4724,view_as_complex_77,call_function,view_as_complex.default,backward,17,1,1,1,4248,1902,6 +4725,_conj_21,call_function,_conj.default,backward,17,1,1,1,4,1903,3 +4726,clone_87,call_function,clone.default,backward,17,1,1,1,5,1902,3 +4727,mul_417,call_function,mul.Tensor,backward,17,2,2,1,4251,1901,8 +4728,view_as_real_76,call_function,view_as_real.default,backward,17,1,1,1,4252,1901,6 +4729,view_879,call_function,view.default,backward,17,1,1,1,4253,1900,6 +4730,convert_element_type_1161,call_function,convert_element_type.default,backward,17,1,1,1,4254,1899,6 +4731,view_as_real_77,call_function,view_as_real.default,backward,17,1,1,1,4252,1900,6 +4732,view_880,call_function,view.default,backward,17,1,1,1,4253,1899,6 +4733,convert_element_type_1162,call_function,convert_element_type.default,backward,17,1,1,1,4254,1898,6 +4734,view_881,call_function,view.default,backward,17,1,1,1,4246,1898,2 +4735,view_882,call_function,view.default,backward,17,1,1,1,4255,1898,5 +4736,view_883,call_function,view.default,backward,17,1,1,1,4255,1897,5 +4737,alias_default_967,call_function,alias.default,backward,17,1,1,2,4247,1897,4 +4738,einsum_default_347,call_function,einsum.default,backward,17,2,2,1,4248,3,5 +4739,permute_655,call_function,permute.default,backward,17,1,1,1,4,1893,3 
+4740,einsum_default_348,call_function,einsum.default,backward,17,2,2,1,4249,1892,5 +4741,permute_656,call_function,permute.default,backward,17,1,1,1,4249,2,4 +4742,dtype_cast_352,call_function,dtype_cast.default,backward,17,1,1,1,4250,1,4 +4743,alias_default_1402,call_function,alias.default,backward,17,1,1,0,4251,0,3 +4744,alias_default_968,call_function,alias.default,backward,17,1,1,2,4256,1897,4 +4745,einsum_default_349,call_function,einsum.default,backward,17,2,2,1,4257,3,5 +4746,permute_659,call_function,permute.default,backward,17,1,1,1,4,1893,3 +4747,einsum_default_350,call_function,einsum.default,backward,17,2,2,1,4258,1892,5 +4748,add_215,call_function,add.Tensor,unknown,,2,2,1,4265,1891,10 +4749,permute_660,call_function,permute.default,backward,17,1,1,1,4258,2,4 +4750,dtype_cast_353,call_function,dtype_cast.default,backward,17,1,1,1,4259,1,4 +4751,alias_default_1401,call_function,alias.default,backward,17,1,1,0,4260,0,3 +4752,alias_default_969,call_function,alias.default,backward,17,1,1,2,4256,1896,4 +4753,einsum_default_351,call_function,einsum.default,backward,17,2,2,1,4257,3,5 +4754,permute_663,call_function,permute.default,backward,17,1,1,1,4,1892,3 +4755,einsum_default_352,call_function,einsum.default,backward,17,2,2,1,4258,1891,5 +4756,add_216,call_function,add.Tensor,unknown,,2,2,1,4281,1890,10 +4757,permute_664,call_function,permute.default,backward,17,1,1,1,4258,2,4 +4758,dtype_cast_354,call_function,dtype_cast.default,backward,17,1,1,1,4259,1,4 +4759,alias_default_1400,call_function,alias.default,backward,17,1,1,0,4260,0,3 +4760,convert_element_type_1175,call_function,convert_element_type.default,backward,17,1,1,1,4282,1889,8 +4761,convert_element_type_1176,call_function,convert_element_type.default,backward,17,1,1,1,1879,1889,4 +4762,convert_element_type_1177,call_function,convert_element_type.default,backward,17,1,1,1,3,1883,2 +4763,alias_default_970,call_function,alias.default,backward,17,1,1,2,4283,1888,4 
+4764,mul_418,call_function,mul.Tensor,backward,17,2,2,1,4285,1882,8 +4765,mul_419,call_function,mul.Tensor,backward,17,2,2,1,1887,1888,8 +4766,alias_default_971,call_function,alias.default,backward,17,1,1,2,4286,1881,4 +4767,alias_default_972,call_function,alias.default,backward,17,1,1,3,1888,1887,4 +4768,mul_420,call_function,mul.Tensor,backward,17,2,2,1,4290,1880,8 +4769,sum_45,call_function,sum.dim_IntList,backward,17,1,1,1,4291,1879,5 +4770,div_50,call_function,div.Tensor,backward,17,1,1,1,1889,1879,6 +4771,mul_421,call_function,mul.Tensor,backward,17,2,2,1,4293,1878,8 +4772,sub_33,call_function,sub.Tensor,backward,17,2,2,1,4294,1877,10 +4773,mul_422,call_function,mul.Tensor,backward,17,2,2,1,4295,1876,8 +4774,mul_423,call_function,mul.Tensor,backward,17,2,2,1,4287,4,8 +4775,sum_46,call_function,sum.dim_IntList,backward,17,1,1,1,4288,3,5 +4776,convert_element_type_1178,call_function,convert_element_type.default,backward,17,1,1,1,4296,1875,6 +4777,convert_element_type_1179,call_function,convert_element_type.default,backward,17,1,1,1,4289,2,3 +4778,add_217,call_function,add.Tensor,unknown,,2,2,1,4297,1874,10 +4779,dtype_cast_355,call_function,dtype_cast.default,backward,17,1,1,1,4290,1,3 +4780,alias_default_1407,call_function,alias.default,backward,17,1,1,0,4291,0,2 +4781,alias_default_973,call_function,alias.default,unknown,,1,1,3,4298,1873,4 +4782,einsum_default_353,call_function,einsum.default,backward,16,2,2,1,4299,3,5 +4783,permute_667,call_function,permute.default,backward,16,1,1,1,4,1869,3 +4784,einsum_default_354,call_function,einsum.default,backward,16,2,2,1,4300,1868,5 +4785,permute_668,call_function,permute.default,backward,16,1,1,1,4300,2,4 +4786,dtype_cast_356,call_function,dtype_cast.default,backward,16,1,1,1,4301,1,4 +4787,alias_default_1396,call_function,alias.default,backward,16,1,1,0,4302,0,3 +4788,alias_default_974,call_function,alias.default,backward,16,1,1,2,4301,1867,4 +4789,mul_424,call_function,mul.Tensor,backward,16,2,2,1,4302,1855,8 
+4790,mul_425,call_function,mul.Tensor,backward,16,2,2,1,4302,1859,8 +4791,alias_default_975,call_function,alias.default,backward,16,1,1,2,4303,1854,4 +4792,einsum_default_355,call_function,einsum.default,backward,16,2,2,1,4304,3,5 +4793,permute_671,call_function,permute.default,backward,16,1,1,1,4,1850,3 +4794,einsum_default_356,call_function,einsum.default,backward,16,2,2,1,4305,1849,5 +4795,permute_672,call_function,permute.default,backward,16,1,1,1,4305,2,4 +4796,dtype_cast_357,call_function,dtype_cast.default,backward,16,1,1,1,4306,1,4 +4797,alias_default_1397,call_function,alias.default,backward,16,1,1,0,4307,0,3 +4798,convert_element_type_1188,call_function,convert_element_type.default,backward,16,1,1,1,4303,1858,6 +4799,convert_element_type_1189,call_function,convert_element_type.default,backward,16,1,1,1,1856,1868,4 +4800,alias_default_976,call_function,alias.default,backward,16,1,1,2,1857,1867,4 +4801,neg_39,call_function,neg.default,backward,16,1,1,1,1858,1866,8 +4802,exp_39,call_function,exp.default,backward,16,1,1,1,1859,1865,6 +4803,add_218,call_function,add.Tensor,backward,16,1,1,1,1860,1864,4 +4804,reciprocal_11,call_function,reciprocal.default,backward,16,1,1,1,1861,1863,4 +4805,mul_426,call_function,mul.Tensor,backward,16,1,1,1,1862,1862,6 +4806,alias_default_977,call_function,alias.default,backward,16,1,1,2,1863,1861,4 +4807,mul_427,call_function,mul.Tensor,backward,16,2,2,1,4312,1857,8 +4808,sub_34,call_function,sub.Tensor,backward,16,1,1,1,1864,1859,4 +4809,mul_428,call_function,mul.Tensor,backward,16,2,2,1,1865,1858,8 +4810,add_219,call_function,add.Tensor,backward,16,1,1,1,1866,1857,4 +4811,mul_429,call_function,mul.Tensor,backward,16,2,2,1,4316,1856,8 +4812,convert_element_type_1190,call_function,convert_element_type.default,backward,16,1,1,1,4317,1855,6 +4813,alias_default_978,call_function,alias.default,backward,16,1,1,2,4318,1854,4 +4814,einsum_default_357,call_function,einsum.default,backward,16,2,2,1,4319,3,5 
+4815,permute_675,call_function,permute.default,backward,16,1,1,1,4,1850,3 +4816,einsum_default_358,call_function,einsum.default,backward,16,2,2,1,4320,1849,5 +4817,add_220,call_function,add.Tensor,unknown,,2,2,1,4325,1848,10 +4818,permute_676,call_function,permute.default,backward,16,1,1,1,4320,2,4 +4819,dtype_cast_358,call_function,dtype_cast.default,backward,16,1,1,1,4321,1,4 +4820,alias_default_1395,call_function,alias.default,backward,16,1,1,0,4322,0,3 +4821,convert_element_type_1195,call_function,convert_element_type.default,backward,16,1,1,1,4326,1847,8 +4822,convert_element_type_1196,call_function,convert_element_type.default,backward,16,1,1,1,1836,1847,4 +4823,convert_element_type_1197,call_function,convert_element_type.default,backward,16,1,1,1,3,1841,2 +4824,alias_default_979,call_function,alias.default,backward,16,1,1,2,4327,1846,4 +4825,mul_430,call_function,mul.Tensor,backward,16,2,2,1,4329,1840,8 +4826,mul_431,call_function,mul.Tensor,backward,16,2,2,1,1844,1846,8 +4827,alias_default_980,call_function,alias.default,backward,16,1,1,2,4330,1839,4 +4828,alias_default_981,call_function,alias.default,backward,16,1,1,3,1845,1845,4 +4829,mul_432,call_function,mul.Tensor,backward,16,2,2,1,4334,1838,8 +4830,sum_47,call_function,sum.dim_IntList,backward,16,1,1,1,4335,1837,5 +4831,div_51,call_function,div.Tensor,backward,16,1,1,1,1846,1837,6 +4832,mul_433,call_function,mul.Tensor,backward,16,2,2,1,4337,1836,8 +4833,sub_35,call_function,sub.Tensor,backward,16,2,2,1,4338,1835,10 +4834,mul_434,call_function,mul.Tensor,backward,16,2,2,1,4339,1834,8 +4835,mul_435,call_function,mul.Tensor,backward,16,2,2,1,4331,4,8 +4836,sum_48,call_function,sum.dim_IntList,backward,16,1,1,1,4332,3,5 +4837,convert_element_type_1198,call_function,convert_element_type.default,backward,16,1,1,1,4340,1833,6 +4838,convert_element_type_1199,call_function,convert_element_type.default,backward,16,1,1,1,4333,2,3 +4839,add_221,call_function,add.Tensor,unknown,,2,2,1,4341,1832,10 
+4840,dtype_cast_359,call_function,dtype_cast.default,backward,16,1,1,1,4334,1,3 +4841,alias_default_1399,call_function,alias.default,backward,16,1,1,0,4335,0,2 +4842,alias_default_982,call_function,alias.default,unknown,,1,1,3,4342,1831,4 +4843,einsum_default_359,call_function,einsum.default,backward,16,2,2,1,4343,3,5 +4844,permute_679,call_function,permute.default,backward,16,1,1,1,4,1827,3 +4845,einsum_default_360,call_function,einsum.default,backward,16,2,2,1,4344,1826,5 +4846,permute_680,call_function,permute.default,backward,16,1,1,1,4344,2,4 +4847,dtype_cast_360,call_function,dtype_cast.default,backward,16,1,1,1,4345,1,4 +4848,alias_default_1394,call_function,alias.default,backward,16,1,1,0,4346,0,3 +4849,view_898,call_function,view.default,backward,16,1,1,1,4345,1825,4 +4850,permute_681,call_function,permute.default,backward,16,1,1,1,4346,1824,4 +4851,_scaled_dot_product_flash_attention_backward_11,call_function,_scaled_dot_product_flash_attention_backward.default,backward,16,8,8,3,4350,1823,2 +4852,getitem_285,call_function,getitem,backward,16,1,1,1,4351,1796,2 +4853,getitem_286,call_function,getitem,backward,16,1,1,1,4351,1797,2 +4854,getitem_287,call_function,getitem,backward,16,1,1,1,4351,1790,2 +4855,permute_682,call_function,permute.default,backward,16,1,1,1,4352,1789,2 +4856,permute_683,call_function,permute.default,backward,16,1,1,1,4352,1796,2 +4857,permute_684,call_function,permute.default,backward,16,1,1,1,4352,1795,2 +4858,convert_element_type_1204,call_function,convert_element_type.default,backward,16,1,1,1,4353,1795,2 +4859,convert_element_type_1205,call_function,convert_element_type.default,backward,16,1,1,1,4353,1794,2 +4860,view_899,call_function,view.default,backward,16,1,1,1,4354,1794,2 +4861,view_as_complex_78,call_function,view_as_complex.default,backward,16,1,1,1,4355,1793,6 +4862,_conj_22,call_function,_conj.default,backward,16,1,1,1,4,1794,3 +4863,clone_94,call_function,clone.default,backward,16,1,1,1,5,1793,3 
+4864,mul_436,call_function,mul.Tensor,backward,16,2,2,1,4358,1792,8 +4865,view_900,call_function,view.default,backward,16,1,1,1,4354,1793,2 +4866,view_as_complex_79,call_function,view_as_complex.default,backward,16,1,1,1,4355,1792,6 +4867,_conj_23,call_function,_conj.default,backward,16,1,1,1,4,1793,3 +4868,clone_95,call_function,clone.default,backward,16,1,1,1,5,1792,3 +4869,mul_437,call_function,mul.Tensor,backward,16,2,2,1,4358,1791,8 +4870,view_as_real_78,call_function,view_as_real.default,backward,16,1,1,1,4359,1791,6 +4871,view_901,call_function,view.default,backward,16,1,1,1,4360,1790,6 +4872,convert_element_type_1206,call_function,convert_element_type.default,backward,16,1,1,1,4361,1789,6 +4873,view_as_real_79,call_function,view_as_real.default,backward,16,1,1,1,4359,1790,6 +4874,view_902,call_function,view.default,backward,16,1,1,1,4360,1789,6 +4875,convert_element_type_1207,call_function,convert_element_type.default,backward,16,1,1,1,4361,1788,6 +4876,view_903,call_function,view.default,backward,16,1,1,1,4353,1788,2 +4877,view_904,call_function,view.default,backward,16,1,1,1,4362,1788,5 +4878,view_905,call_function,view.default,backward,16,1,1,1,4362,1787,5 +4879,alias_default_983,call_function,alias.default,backward,16,1,1,2,4354,1787,4 +4880,einsum_default_361,call_function,einsum.default,backward,16,2,2,1,4355,3,5 +4881,permute_687,call_function,permute.default,backward,16,1,1,1,4,1783,3 +4882,einsum_default_362,call_function,einsum.default,backward,16,2,2,1,4356,1782,5 +4883,permute_688,call_function,permute.default,backward,16,1,1,1,4356,2,4 +4884,dtype_cast_361,call_function,dtype_cast.default,backward,16,1,1,1,4357,1,4 +4885,alias_default_1393,call_function,alias.default,backward,16,1,1,0,4358,0,3 +4886,alias_default_984,call_function,alias.default,backward,16,1,1,2,4363,1787,4 +4887,einsum_default_363,call_function,einsum.default,backward,16,2,2,1,4364,3,5 +4888,permute_691,call_function,permute.default,backward,16,1,1,1,4,1783,3 
+4889,einsum_default_364,call_function,einsum.default,backward,16,2,2,1,4365,1782,5 +4890,add_222,call_function,add.Tensor,unknown,,2,2,1,4372,1781,10 +4891,permute_692,call_function,permute.default,backward,16,1,1,1,4365,2,4 +4892,dtype_cast_362,call_function,dtype_cast.default,backward,16,1,1,1,4366,1,4 +4893,alias_default_1392,call_function,alias.default,backward,16,1,1,0,4367,0,3 +4894,alias_default_985,call_function,alias.default,backward,16,1,1,2,4363,1786,4 +4895,einsum_default_365,call_function,einsum.default,backward,16,2,2,1,4364,3,5 +4896,permute_695,call_function,permute.default,backward,16,1,1,1,4,1782,3 +4897,einsum_default_366,call_function,einsum.default,backward,16,2,2,1,4365,1781,5 +4898,add_223,call_function,add.Tensor,unknown,,2,2,1,4388,1780,10 +4899,permute_696,call_function,permute.default,backward,16,1,1,1,4365,2,4 +4900,dtype_cast_363,call_function,dtype_cast.default,backward,16,1,1,1,4366,1,4 +4901,alias_default_1391,call_function,alias.default,backward,16,1,1,0,4367,0,3 +4902,convert_element_type_1220,call_function,convert_element_type.default,backward,16,1,1,1,4389,1779,8 +4903,convert_element_type_1221,call_function,convert_element_type.default,backward,16,1,1,1,1769,1779,4 +4904,convert_element_type_1222,call_function,convert_element_type.default,backward,16,1,1,1,3,1773,2 +4905,alias_default_986,call_function,alias.default,backward,16,1,1,2,4390,1778,4 +4906,mul_438,call_function,mul.Tensor,backward,16,2,2,1,4392,1772,8 +4907,mul_439,call_function,mul.Tensor,backward,16,2,2,1,1777,1778,8 +4908,alias_default_987,call_function,alias.default,backward,16,1,1,2,4393,1771,4 +4909,alias_default_988,call_function,alias.default,backward,16,1,1,3,1778,1777,4 +4910,mul_440,call_function,mul.Tensor,backward,16,2,2,1,4397,1770,8 +4911,sum_49,call_function,sum.dim_IntList,backward,16,1,1,1,4398,1769,5 +4912,div_52,call_function,div.Tensor,backward,16,1,1,1,1779,1769,6 +4913,mul_441,call_function,mul.Tensor,backward,16,2,2,1,4400,1768,8 
+4914,sub_36,call_function,sub.Tensor,backward,16,2,2,1,4401,1767,10 +4915,mul_442,call_function,mul.Tensor,backward,16,2,2,1,4402,1766,8 +4916,mul_443,call_function,mul.Tensor,backward,16,2,2,1,4394,4,8 +4917,sum_50,call_function,sum.dim_IntList,backward,16,1,1,1,4395,3,5 +4918,convert_element_type_1223,call_function,convert_element_type.default,backward,16,1,1,1,4403,1765,6 +4919,convert_element_type_1224,call_function,convert_element_type.default,backward,16,1,1,1,4396,2,3 +4920,add_224,call_function,add.Tensor,unknown,,2,2,1,4404,1764,10 +4921,dtype_cast_364,call_function,dtype_cast.default,backward,16,1,1,1,4397,1,3 +4922,alias_default_1398,call_function,alias.default,backward,16,1,1,0,4398,0,2 +4923,alias_default_989,call_function,alias.default,unknown,,1,1,3,4405,1763,4 +4924,einsum_default_367,call_function,einsum.default,backward,15,2,2,1,4406,3,5 +4925,permute_699,call_function,permute.default,backward,15,1,1,1,4,1759,3 +4926,einsum_default_368,call_function,einsum.default,backward,15,2,2,1,4407,1758,5 +4927,permute_700,call_function,permute.default,backward,15,1,1,1,4407,2,4 +4928,dtype_cast_365,call_function,dtype_cast.default,backward,15,1,1,1,4408,1,4 +4929,alias_default_1387,call_function,alias.default,backward,15,1,1,0,4409,0,3 +4930,alias_default_990,call_function,alias.default,backward,15,1,1,2,4408,1757,4 +4931,mul_444,call_function,mul.Tensor,backward,15,2,2,1,4409,1745,8 +4932,mul_445,call_function,mul.Tensor,backward,15,2,2,1,4409,1749,8 +4933,alias_default_991,call_function,alias.default,backward,15,1,1,2,4410,1744,4 +4934,einsum_default_369,call_function,einsum.default,backward,15,2,2,1,4411,3,5 +4935,permute_703,call_function,permute.default,backward,15,1,1,1,4,1740,3 +4936,einsum_default_370,call_function,einsum.default,backward,15,2,2,1,4412,1739,5 +4937,permute_704,call_function,permute.default,backward,15,1,1,1,4412,2,4 +4938,dtype_cast_366,call_function,dtype_cast.default,backward,15,1,1,1,4413,1,4 
+4939,alias_default_1388,call_function,alias.default,backward,15,1,1,0,4414,0,3 +4940,convert_element_type_1233,call_function,convert_element_type.default,backward,15,1,1,1,4410,1748,6 +4941,convert_element_type_1234,call_function,convert_element_type.default,backward,15,1,1,1,1746,1758,4 +4942,alias_default_992,call_function,alias.default,backward,15,1,1,2,1747,1757,4 +4943,neg_40,call_function,neg.default,backward,15,1,1,1,1748,1756,8 +4944,exp_40,call_function,exp.default,backward,15,1,1,1,1749,1755,6 +4945,add_225,call_function,add.Tensor,backward,15,1,1,1,1750,1754,4 +4946,reciprocal_12,call_function,reciprocal.default,backward,15,1,1,1,1751,1753,4 +4947,mul_446,call_function,mul.Tensor,backward,15,1,1,1,1752,1752,6 +4948,alias_default_993,call_function,alias.default,backward,15,1,1,2,1753,1751,4 +4949,mul_447,call_function,mul.Tensor,backward,15,2,2,1,4419,1747,8 +4950,sub_37,call_function,sub.Tensor,backward,15,1,1,1,1754,1749,4 +4951,mul_448,call_function,mul.Tensor,backward,15,2,2,1,1755,1748,8 +4952,add_226,call_function,add.Tensor,backward,15,1,1,1,1756,1747,4 +4953,mul_449,call_function,mul.Tensor,backward,15,2,2,1,4423,1746,8 +4954,convert_element_type_1235,call_function,convert_element_type.default,backward,15,1,1,1,4424,1745,6 +4955,alias_default_994,call_function,alias.default,backward,15,1,1,2,4425,1744,4 +4956,einsum_default_371,call_function,einsum.default,backward,15,2,2,1,4426,3,5 +4957,permute_707,call_function,permute.default,backward,15,1,1,1,4,1740,3 +4958,einsum_default_372,call_function,einsum.default,backward,15,2,2,1,4427,1739,5 +4959,add_227,call_function,add.Tensor,unknown,,2,2,1,4432,1738,10 +4960,permute_708,call_function,permute.default,backward,15,1,1,1,4427,2,4 +4961,dtype_cast_367,call_function,dtype_cast.default,backward,15,1,1,1,4428,1,4 +4962,alias_default_1386,call_function,alias.default,backward,15,1,1,0,4429,0,3 +4963,convert_element_type_1240,call_function,convert_element_type.default,backward,15,1,1,1,4433,1737,8 
+4964,convert_element_type_1241,call_function,convert_element_type.default,backward,15,1,1,1,1726,1737,4 +4965,convert_element_type_1242,call_function,convert_element_type.default,backward,15,1,1,1,3,1731,2 +4966,alias_default_995,call_function,alias.default,backward,15,1,1,2,4434,1736,4 +4967,mul_450,call_function,mul.Tensor,backward,15,2,2,1,4436,1730,8 +4968,mul_451,call_function,mul.Tensor,backward,15,2,2,1,1734,1736,8 +4969,alias_default_996,call_function,alias.default,backward,15,1,1,2,4437,1729,4 +4970,alias_default_997,call_function,alias.default,backward,15,1,1,3,1735,1735,4 +4971,mul_452,call_function,mul.Tensor,backward,15,2,2,1,4441,1728,8 +4972,sum_51,call_function,sum.dim_IntList,backward,15,1,1,1,4442,1727,5 +4973,div_53,call_function,div.Tensor,backward,15,1,1,1,1736,1727,6 +4974,mul_453,call_function,mul.Tensor,backward,15,2,2,1,4444,1726,8 +4975,sub_38,call_function,sub.Tensor,backward,15,2,2,1,4445,1725,10 +4976,mul_454,call_function,mul.Tensor,backward,15,2,2,1,4446,1724,8 +4977,mul_455,call_function,mul.Tensor,backward,15,2,2,1,4438,4,8 +4978,sum_52,call_function,sum.dim_IntList,backward,15,1,1,1,4439,3,5 +4979,convert_element_type_1243,call_function,convert_element_type.default,backward,15,1,1,1,4447,1723,6 +4980,convert_element_type_1244,call_function,convert_element_type.default,backward,15,1,1,1,4440,2,3 +4981,add_228,call_function,add.Tensor,unknown,,2,2,1,4448,1722,10 +4982,dtype_cast_368,call_function,dtype_cast.default,backward,15,1,1,1,4441,1,3 +4983,alias_default_1390,call_function,alias.default,backward,15,1,1,0,4442,0,2 +4984,alias_default_998,call_function,alias.default,unknown,,1,1,3,4449,1721,4 +4985,einsum_default_373,call_function,einsum.default,backward,15,2,2,1,4450,3,5 +4986,permute_711,call_function,permute.default,backward,15,1,1,1,4,1717,3 +4987,einsum_default_374,call_function,einsum.default,backward,15,2,2,1,4451,1716,5 +4988,permute_712,call_function,permute.default,backward,15,1,1,1,4451,2,4 
+4989,dtype_cast_369,call_function,dtype_cast.default,backward,15,1,1,1,4452,1,4 +4990,alias_default_1385,call_function,alias.default,backward,15,1,1,0,4453,0,3 +4991,view_920,call_function,view.default,backward,15,1,1,1,4452,1715,4 +4992,permute_713,call_function,permute.default,backward,15,1,1,1,4453,1714,4 +4993,_scaled_dot_product_flash_attention_backward_12,call_function,_scaled_dot_product_flash_attention_backward.default,backward,15,8,8,3,4457,1713,2 +4994,getitem_288,call_function,getitem,backward,15,1,1,1,4458,1686,2 +4995,getitem_289,call_function,getitem,backward,15,1,1,1,4458,1687,2 +4996,getitem_290,call_function,getitem,backward,15,1,1,1,4458,1680,2 +4997,permute_714,call_function,permute.default,backward,15,1,1,1,4459,1679,2 +4998,permute_715,call_function,permute.default,backward,15,1,1,1,4459,1686,2 +4999,permute_716,call_function,permute.default,backward,15,1,1,1,4459,1685,2 +5000,convert_element_type_1249,call_function,convert_element_type.default,backward,15,1,1,1,4460,1685,2 +5001,convert_element_type_1250,call_function,convert_element_type.default,backward,15,1,1,1,4460,1684,2 +5002,view_921,call_function,view.default,backward,15,1,1,1,4461,1684,2 +5003,view_as_complex_80,call_function,view_as_complex.default,backward,15,1,1,1,4462,1683,6 +5004,_conj_24,call_function,_conj.default,backward,15,1,1,1,4,1684,3 +5005,clone_102,call_function,clone.default,backward,15,1,1,1,5,1683,3 +5006,mul_456,call_function,mul.Tensor,backward,15,2,2,1,4465,1682,8 +5007,view_922,call_function,view.default,backward,15,1,1,1,4461,1683,2 +5008,view_as_complex_81,call_function,view_as_complex.default,backward,15,1,1,1,4462,1682,6 +5009,_conj_25,call_function,_conj.default,backward,15,1,1,1,4,1683,3 +5010,clone_103,call_function,clone.default,backward,15,1,1,1,5,1682,3 +5011,mul_457,call_function,mul.Tensor,backward,15,2,2,1,4465,1681,8 +5012,view_as_real_80,call_function,view_as_real.default,backward,15,1,1,1,4466,1681,6 
+5013,view_923,call_function,view.default,backward,15,1,1,1,4467,1680,6 +5014,convert_element_type_1251,call_function,convert_element_type.default,backward,15,1,1,1,4468,1679,6 +5015,view_as_real_81,call_function,view_as_real.default,backward,15,1,1,1,4466,1680,6 +5016,view_924,call_function,view.default,backward,15,1,1,1,4467,1679,6 +5017,convert_element_type_1252,call_function,convert_element_type.default,backward,15,1,1,1,4468,1678,6 +5018,view_925,call_function,view.default,backward,15,1,1,1,4460,1678,2 +5019,view_926,call_function,view.default,backward,15,1,1,1,4469,1678,5 +5020,view_927,call_function,view.default,backward,15,1,1,1,4469,1677,5 +5021,alias_default_999,call_function,alias.default,backward,15,1,1,2,4461,1677,4 +5022,einsum_default_375,call_function,einsum.default,backward,15,2,2,1,4462,3,5 +5023,permute_719,call_function,permute.default,backward,15,1,1,1,4,1673,3 +5024,einsum_default_376,call_function,einsum.default,backward,15,2,2,1,4463,1672,5 +5025,permute_720,call_function,permute.default,backward,15,1,1,1,4463,2,4 +5026,dtype_cast_370,call_function,dtype_cast.default,backward,15,1,1,1,4464,1,4 +5027,alias_default_1384,call_function,alias.default,backward,15,1,1,0,4465,0,3 +5028,alias_default_1000,call_function,alias.default,backward,15,1,1,2,4470,1677,4 +5029,einsum_default_377,call_function,einsum.default,backward,15,2,2,1,4471,3,5 +5030,permute_723,call_function,permute.default,backward,15,1,1,1,4,1673,3 +5031,einsum_default_378,call_function,einsum.default,backward,15,2,2,1,4472,1672,5 +5032,add_229,call_function,add.Tensor,unknown,,2,2,1,4479,1671,10 +5033,permute_724,call_function,permute.default,backward,15,1,1,1,4472,2,4 +5034,dtype_cast_371,call_function,dtype_cast.default,backward,15,1,1,1,4473,1,4 +5035,alias_default_1383,call_function,alias.default,backward,15,1,1,0,4474,0,3 +5036,alias_default_1001,call_function,alias.default,backward,15,1,1,2,4470,1676,4 
+5037,einsum_default_379,call_function,einsum.default,backward,15,2,2,1,4471,3,5 +5038,permute_727,call_function,permute.default,backward,15,1,1,1,4,1672,3 +5039,einsum_default_380,call_function,einsum.default,backward,15,2,2,1,4472,1671,5 +5040,add_230,call_function,add.Tensor,unknown,,2,2,1,4495,1670,10 +5041,permute_728,call_function,permute.default,backward,15,1,1,1,4472,2,4 +5042,dtype_cast_372,call_function,dtype_cast.default,backward,15,1,1,1,4473,1,4 +5043,alias_default_1382,call_function,alias.default,backward,15,1,1,0,4474,0,3 +5044,convert_element_type_1265,call_function,convert_element_type.default,backward,15,1,1,1,4496,1669,8 +5045,convert_element_type_1266,call_function,convert_element_type.default,backward,15,1,1,1,1659,1669,4 +5046,convert_element_type_1267,call_function,convert_element_type.default,backward,15,1,1,1,3,1663,2 +5047,alias_default_1002,call_function,alias.default,backward,15,1,1,2,4497,1668,4 +5048,mul_458,call_function,mul.Tensor,backward,15,2,2,1,4499,1662,8 +5049,mul_459,call_function,mul.Tensor,backward,15,2,2,1,1667,1668,8 +5050,alias_default_1003,call_function,alias.default,backward,15,1,1,2,4500,1661,4 +5051,alias_default_1004,call_function,alias.default,backward,15,1,1,3,1668,1667,4 +5052,mul_460,call_function,mul.Tensor,backward,15,2,2,1,4504,1660,8 +5053,sum_53,call_function,sum.dim_IntList,backward,15,1,1,1,4505,1659,5 +5054,div_54,call_function,div.Tensor,backward,15,1,1,1,1669,1659,6 +5055,mul_461,call_function,mul.Tensor,backward,15,2,2,1,4507,1658,8 +5056,sub_39,call_function,sub.Tensor,backward,15,2,2,1,4508,1657,10 +5057,mul_462,call_function,mul.Tensor,backward,15,2,2,1,4509,1656,8 +5058,mul_463,call_function,mul.Tensor,backward,15,2,2,1,4501,4,8 +5059,sum_54,call_function,sum.dim_IntList,backward,15,1,1,1,4502,3,5 +5060,convert_element_type_1268,call_function,convert_element_type.default,backward,15,1,1,1,4510,1655,6 
+5061,convert_element_type_1269,call_function,convert_element_type.default,backward,15,1,1,1,4503,2,3 +5062,add_231,call_function,add.Tensor,unknown,,2,2,1,4511,1654,10 +5063,dtype_cast_373,call_function,dtype_cast.default,backward,15,1,1,1,4504,1,3 +5064,alias_default_1389,call_function,alias.default,backward,15,1,1,0,4505,0,2 +5065,alias_default_1005,call_function,alias.default,unknown,,1,1,3,4512,1653,4 +5066,einsum_default_381,call_function,einsum.default,backward,14,2,2,1,4513,3,5 +5067,permute_731,call_function,permute.default,backward,14,1,1,1,4,1649,3 +5068,einsum_default_382,call_function,einsum.default,backward,14,2,2,1,4514,1648,5 +5069,permute_732,call_function,permute.default,backward,14,1,1,1,4514,2,4 +5070,dtype_cast_374,call_function,dtype_cast.default,backward,14,1,1,1,4515,1,4 +5071,alias_default_1378,call_function,alias.default,backward,14,1,1,0,4516,0,3 +5072,alias_default_1006,call_function,alias.default,backward,14,1,1,2,4515,1647,4 +5073,mul_464,call_function,mul.Tensor,backward,14,2,2,1,4516,1635,8 +5074,mul_465,call_function,mul.Tensor,backward,14,2,2,1,4516,1639,8 +5075,alias_default_1007,call_function,alias.default,backward,14,1,1,2,4517,1634,4 +5076,einsum_default_383,call_function,einsum.default,backward,14,2,2,1,4518,3,5 +5077,permute_735,call_function,permute.default,backward,14,1,1,1,4,1630,3 +5078,einsum_default_384,call_function,einsum.default,backward,14,2,2,1,4519,1629,5 +5079,permute_736,call_function,permute.default,backward,14,1,1,1,4519,2,4 +5080,dtype_cast_375,call_function,dtype_cast.default,backward,14,1,1,1,4520,1,4 +5081,alias_default_1379,call_function,alias.default,backward,14,1,1,0,4521,0,3 +5082,convert_element_type_1278,call_function,convert_element_type.default,backward,14,1,1,1,4517,1638,6 +5083,convert_element_type_1279,call_function,convert_element_type.default,backward,14,1,1,1,1636,1648,4 +5084,alias_default_1008,call_function,alias.default,backward,14,1,1,2,1637,1647,4 
+5085,neg_41,call_function,neg.default,backward,14,1,1,1,1638,1646,8 +5086,exp_41,call_function,exp.default,backward,14,1,1,1,1639,1645,6 +5087,add_232,call_function,add.Tensor,backward,14,1,1,1,1640,1644,4 +5088,reciprocal_13,call_function,reciprocal.default,backward,14,1,1,1,1641,1643,4 +5089,mul_466,call_function,mul.Tensor,backward,14,1,1,1,1642,1642,6 +5090,alias_default_1009,call_function,alias.default,backward,14,1,1,2,1643,1641,4 +5091,mul_467,call_function,mul.Tensor,backward,14,2,2,1,4526,1637,8 +5092,sub_40,call_function,sub.Tensor,backward,14,1,1,1,1644,1639,4 +5093,mul_468,call_function,mul.Tensor,backward,14,2,2,1,1645,1638,8 +5094,add_233,call_function,add.Tensor,backward,14,1,1,1,1646,1637,4 +5095,mul_469,call_function,mul.Tensor,backward,14,2,2,1,4530,1636,8 +5096,convert_element_type_1280,call_function,convert_element_type.default,backward,14,1,1,1,4531,1635,6 +5097,alias_default_1010,call_function,alias.default,backward,14,1,1,2,4532,1634,4 +5098,einsum_default_385,call_function,einsum.default,backward,14,2,2,1,4533,3,5 +5099,permute_739,call_function,permute.default,backward,14,1,1,1,4,1630,3 +5100,einsum_default_386,call_function,einsum.default,backward,14,2,2,1,4534,1629,5 +5101,add_234,call_function,add.Tensor,unknown,,2,2,1,4539,1628,10 +5102,permute_740,call_function,permute.default,backward,14,1,1,1,4534,2,4 +5103,dtype_cast_376,call_function,dtype_cast.default,backward,14,1,1,1,4535,1,4 +5104,alias_default_1377,call_function,alias.default,backward,14,1,1,0,4536,0,3 +5105,convert_element_type_1285,call_function,convert_element_type.default,backward,14,1,1,1,4540,1627,8 +5106,convert_element_type_1286,call_function,convert_element_type.default,backward,14,1,1,1,1616,1627,4 +5107,convert_element_type_1287,call_function,convert_element_type.default,backward,14,1,1,1,3,1621,2 +5108,alias_default_1011,call_function,alias.default,backward,14,1,1,2,4541,1626,4 +5109,mul_470,call_function,mul.Tensor,backward,14,2,2,1,4543,1620,8 
+5110,mul_471,call_function,mul.Tensor,backward,14,2,2,1,1624,1626,8 +5111,alias_default_1012,call_function,alias.default,backward,14,1,1,2,4544,1619,4 +5112,alias_default_1013,call_function,alias.default,backward,14,1,1,3,1625,1625,4 +5113,mul_472,call_function,mul.Tensor,backward,14,2,2,1,4548,1618,8 +5114,sum_55,call_function,sum.dim_IntList,backward,14,1,1,1,4549,1617,5 +5115,div_55,call_function,div.Tensor,backward,14,1,1,1,1626,1617,6 +5116,mul_473,call_function,mul.Tensor,backward,14,2,2,1,4551,1616,8 +5117,sub_41,call_function,sub.Tensor,backward,14,2,2,1,4552,1615,10 +5118,mul_474,call_function,mul.Tensor,backward,14,2,2,1,4553,1614,8 +5119,mul_475,call_function,mul.Tensor,backward,14,2,2,1,4545,4,8 +5120,sum_56,call_function,sum.dim_IntList,backward,14,1,1,1,4546,3,5 +5121,convert_element_type_1288,call_function,convert_element_type.default,backward,14,1,1,1,4554,1613,6 +5122,convert_element_type_1289,call_function,convert_element_type.default,backward,14,1,1,1,4547,2,3 +5123,add_235,call_function,add.Tensor,unknown,,2,2,1,4555,1612,10 +5124,dtype_cast_377,call_function,dtype_cast.default,backward,14,1,1,1,4548,1,3 +5125,alias_default_1381,call_function,alias.default,backward,14,1,1,0,4549,0,2 +5126,alias_default_1014,call_function,alias.default,unknown,,1,1,3,4556,1611,4 +5127,einsum_default_387,call_function,einsum.default,backward,14,2,2,1,4557,3,5 +5128,permute_743,call_function,permute.default,backward,14,1,1,1,4,1607,3 +5129,einsum_default_388,call_function,einsum.default,backward,14,2,2,1,4558,1606,5 +5130,permute_744,call_function,permute.default,backward,14,1,1,1,4558,2,4 +5131,dtype_cast_378,call_function,dtype_cast.default,backward,14,1,1,1,4559,1,4 +5132,alias_default_1376,call_function,alias.default,backward,14,1,1,0,4560,0,3 +5133,view_942,call_function,view.default,backward,14,1,1,1,4559,1605,4 +5134,permute_745,call_function,permute.default,backward,14,1,1,1,4560,1604,4 
+5135,_scaled_dot_product_flash_attention_backward_13,call_function,_scaled_dot_product_flash_attention_backward.default,backward,14,8,8,3,4564,1603,2 +5136,getitem_291,call_function,getitem,backward,14,1,1,1,4565,1576,2 +5137,getitem_292,call_function,getitem,backward,14,1,1,1,4565,1577,2 +5138,getitem_293,call_function,getitem,backward,14,1,1,1,4565,1570,2 +5139,permute_746,call_function,permute.default,backward,14,1,1,1,4566,1569,2 +5140,permute_747,call_function,permute.default,backward,14,1,1,1,4566,1576,2 +5141,permute_748,call_function,permute.default,backward,14,1,1,1,4566,1575,2 +5142,convert_element_type_1294,call_function,convert_element_type.default,backward,14,1,1,1,4567,1575,2 +5143,convert_element_type_1295,call_function,convert_element_type.default,backward,14,1,1,1,4567,1574,2 +5144,view_943,call_function,view.default,backward,14,1,1,1,4568,1574,2 +5145,view_as_complex_82,call_function,view_as_complex.default,backward,14,1,1,1,4569,1573,6 +5146,_conj_26,call_function,_conj.default,backward,14,1,1,1,4,1574,3 +5147,clone_110,call_function,clone.default,backward,14,1,1,1,5,1573,3 +5148,mul_476,call_function,mul.Tensor,backward,14,2,2,1,4572,1572,8 +5149,view_944,call_function,view.default,backward,14,1,1,1,4568,1573,2 +5150,view_as_complex_83,call_function,view_as_complex.default,backward,14,1,1,1,4569,1572,6 +5151,_conj_27,call_function,_conj.default,backward,14,1,1,1,4,1573,3 +5152,clone_111,call_function,clone.default,backward,14,1,1,1,5,1572,3 +5153,mul_477,call_function,mul.Tensor,backward,14,2,2,1,4572,1571,8 +5154,view_as_real_82,call_function,view_as_real.default,backward,14,1,1,1,4573,1571,6 +5155,view_945,call_function,view.default,backward,14,1,1,1,4574,1570,6 +5156,convert_element_type_1296,call_function,convert_element_type.default,backward,14,1,1,1,4575,1569,6 +5157,view_as_real_83,call_function,view_as_real.default,backward,14,1,1,1,4573,1570,6 +5158,view_946,call_function,view.default,backward,14,1,1,1,4574,1569,6 
+5159,convert_element_type_1297,call_function,convert_element_type.default,backward,14,1,1,1,4575,1568,6 +5160,view_947,call_function,view.default,backward,14,1,1,1,4567,1568,2 +5161,view_948,call_function,view.default,backward,14,1,1,1,4576,1568,5 +5162,view_949,call_function,view.default,backward,14,1,1,1,4576,1567,5 +5163,alias_default_1015,call_function,alias.default,backward,14,1,1,2,4568,1567,4 +5164,einsum_default_389,call_function,einsum.default,backward,14,2,2,1,4569,3,5 +5165,permute_751,call_function,permute.default,backward,14,1,1,1,4,1563,3 +5166,einsum_default_390,call_function,einsum.default,backward,14,2,2,1,4570,1562,5 +5167,permute_752,call_function,permute.default,backward,14,1,1,1,4570,2,4 +5168,dtype_cast_379,call_function,dtype_cast.default,backward,14,1,1,1,4571,1,4 +5169,alias_default_1375,call_function,alias.default,backward,14,1,1,0,4572,0,3 +5170,alias_default_1016,call_function,alias.default,backward,14,1,1,2,4577,1567,4 +5171,einsum_default_391,call_function,einsum.default,backward,14,2,2,1,4578,3,5 +5172,permute_755,call_function,permute.default,backward,14,1,1,1,4,1563,3 +5173,einsum_default_392,call_function,einsum.default,backward,14,2,2,1,4579,1562,5 +5174,add_236,call_function,add.Tensor,unknown,,2,2,1,4586,1561,10 +5175,permute_756,call_function,permute.default,backward,14,1,1,1,4579,2,4 +5176,dtype_cast_380,call_function,dtype_cast.default,backward,14,1,1,1,4580,1,4 +5177,alias_default_1374,call_function,alias.default,backward,14,1,1,0,4581,0,3 +5178,alias_default_1017,call_function,alias.default,backward,14,1,1,2,4577,1566,4 +5179,einsum_default_393,call_function,einsum.default,backward,14,2,2,1,4578,3,5 +5180,permute_759,call_function,permute.default,backward,14,1,1,1,4,1562,3 +5181,einsum_default_394,call_function,einsum.default,backward,14,2,2,1,4579,1561,5 +5182,add_237,call_function,add.Tensor,unknown,,2,2,1,4602,1560,10 +5183,permute_760,call_function,permute.default,backward,14,1,1,1,4579,2,4 
+5184,dtype_cast_381,call_function,dtype_cast.default,backward,14,1,1,1,4580,1,4 +5185,alias_default_1373,call_function,alias.default,backward,14,1,1,0,4581,0,3 +5186,convert_element_type_1310,call_function,convert_element_type.default,backward,14,1,1,1,4603,1559,8 +5187,convert_element_type_1311,call_function,convert_element_type.default,backward,14,1,1,1,1549,1559,4 +5188,convert_element_type_1312,call_function,convert_element_type.default,backward,14,1,1,1,3,1553,2 +5189,alias_default_1018,call_function,alias.default,backward,14,1,1,2,4604,1558,4 +5190,mul_478,call_function,mul.Tensor,backward,14,2,2,1,4606,1552,8 +5191,mul_479,call_function,mul.Tensor,backward,14,2,2,1,1557,1558,8 +5192,alias_default_1019,call_function,alias.default,backward,14,1,1,2,4607,1551,4 +5193,alias_default_1020,call_function,alias.default,backward,14,1,1,3,1558,1557,4 +5194,mul_480,call_function,mul.Tensor,backward,14,2,2,1,4611,1550,8 +5195,sum_57,call_function,sum.dim_IntList,backward,14,1,1,1,4612,1549,5 +5196,div_56,call_function,div.Tensor,backward,14,1,1,1,1559,1549,6 +5197,mul_481,call_function,mul.Tensor,backward,14,2,2,1,4614,1548,8 +5198,sub_42,call_function,sub.Tensor,backward,14,2,2,1,4615,1547,10 +5199,mul_482,call_function,mul.Tensor,backward,14,2,2,1,4616,1546,8 +5200,mul_483,call_function,mul.Tensor,backward,14,2,2,1,4608,4,8 +5201,sum_58,call_function,sum.dim_IntList,backward,14,1,1,1,4609,3,5 +5202,convert_element_type_1313,call_function,convert_element_type.default,backward,14,1,1,1,4617,1545,6 +5203,convert_element_type_1314,call_function,convert_element_type.default,backward,14,1,1,1,4610,2,3 +5204,add_238,call_function,add.Tensor,unknown,,2,2,1,4618,1544,10 +5205,dtype_cast_382,call_function,dtype_cast.default,backward,14,1,1,1,4611,1,3 +5206,alias_default_1380,call_function,alias.default,backward,14,1,1,0,4612,0,2 +5207,alias_default_1021,call_function,alias.default,unknown,,1,1,3,4619,1543,4 
+5208,einsum_default_395,call_function,einsum.default,backward,13,2,2,1,4620,3,5 +5209,permute_763,call_function,permute.default,backward,13,1,1,1,4,1539,3 +5210,einsum_default_396,call_function,einsum.default,backward,13,2,2,1,4621,1538,5 +5211,permute_764,call_function,permute.default,backward,13,1,1,1,4621,2,4 +5212,dtype_cast_383,call_function,dtype_cast.default,backward,13,1,1,1,4622,1,4 +5213,alias_default_1369,call_function,alias.default,backward,13,1,1,0,4623,0,3 +5214,alias_default_1022,call_function,alias.default,backward,13,1,1,2,4622,1537,4 +5215,mul_484,call_function,mul.Tensor,backward,13,2,2,1,4623,1525,8 +5216,mul_485,call_function,mul.Tensor,backward,13,2,2,1,4623,1529,8 +5217,alias_default_1023,call_function,alias.default,backward,13,1,1,2,4624,1524,4 +5218,einsum_default_397,call_function,einsum.default,backward,13,2,2,1,4625,3,5 +5219,permute_767,call_function,permute.default,backward,13,1,1,1,4,1520,3 +5220,einsum_default_398,call_function,einsum.default,backward,13,2,2,1,4626,1519,5 +5221,permute_768,call_function,permute.default,backward,13,1,1,1,4626,2,4 +5222,dtype_cast_384,call_function,dtype_cast.default,backward,13,1,1,1,4627,1,4 +5223,alias_default_1370,call_function,alias.default,backward,13,1,1,0,4628,0,3 +5224,convert_element_type_1323,call_function,convert_element_type.default,backward,13,1,1,1,4624,1528,6 +5225,convert_element_type_1324,call_function,convert_element_type.default,backward,13,1,1,1,1526,1538,4 +5226,alias_default_1024,call_function,alias.default,backward,13,1,1,2,1527,1537,4 +5227,neg_42,call_function,neg.default,backward,13,1,1,1,1528,1536,8 +5228,exp_42,call_function,exp.default,backward,13,1,1,1,1529,1535,6 +5229,add_239,call_function,add.Tensor,backward,13,1,1,1,1530,1534,4 +5230,reciprocal_14,call_function,reciprocal.default,backward,13,1,1,1,1531,1533,4 +5231,mul_486,call_function,mul.Tensor,backward,13,1,1,1,1532,1532,6 +5232,alias_default_1025,call_function,alias.default,backward,13,1,1,2,1533,1531,4 
+5233,mul_487,call_function,mul.Tensor,backward,13,2,2,1,4633,1527,8 +5234,sub_43,call_function,sub.Tensor,backward,13,1,1,1,1534,1529,4 +5235,mul_488,call_function,mul.Tensor,backward,13,2,2,1,1535,1528,8 +5236,add_240,call_function,add.Tensor,backward,13,1,1,1,1536,1527,4 +5237,mul_489,call_function,mul.Tensor,backward,13,2,2,1,4637,1526,8 +5238,convert_element_type_1325,call_function,convert_element_type.default,backward,13,1,1,1,4638,1525,6 +5239,alias_default_1026,call_function,alias.default,backward,13,1,1,2,4639,1524,4 +5240,einsum_default_399,call_function,einsum.default,backward,13,2,2,1,4640,3,5 +5241,permute_771,call_function,permute.default,backward,13,1,1,1,4,1520,3 +5242,einsum_default_400,call_function,einsum.default,backward,13,2,2,1,4641,1519,5 +5243,add_241,call_function,add.Tensor,unknown,,2,2,1,4646,1518,10 +5244,permute_772,call_function,permute.default,backward,13,1,1,1,4641,2,4 +5245,dtype_cast_385,call_function,dtype_cast.default,backward,13,1,1,1,4642,1,4 +5246,alias_default_1368,call_function,alias.default,backward,13,1,1,0,4643,0,3 +5247,convert_element_type_1330,call_function,convert_element_type.default,backward,13,1,1,1,4647,1517,8 +5248,convert_element_type_1331,call_function,convert_element_type.default,backward,13,1,1,1,1506,1517,4 +5249,convert_element_type_1332,call_function,convert_element_type.default,backward,13,1,1,1,3,1511,2 +5250,alias_default_1027,call_function,alias.default,backward,13,1,1,2,4648,1516,4 +5251,mul_490,call_function,mul.Tensor,backward,13,2,2,1,4650,1510,8 +5252,mul_491,call_function,mul.Tensor,backward,13,2,2,1,1514,1516,8 +5253,alias_default_1028,call_function,alias.default,backward,13,1,1,2,4651,1509,4 +5254,alias_default_1029,call_function,alias.default,backward,13,1,1,3,1515,1515,4 +5255,mul_492,call_function,mul.Tensor,backward,13,2,2,1,4655,1508,8 +5256,sum_59,call_function,sum.dim_IntList,backward,13,1,1,1,4656,1507,5 +5257,div_57,call_function,div.Tensor,backward,13,1,1,1,1516,1507,6 
+5258,mul_493,call_function,mul.Tensor,backward,13,2,2,1,4658,1506,8 +5259,sub_44,call_function,sub.Tensor,backward,13,2,2,1,4659,1505,10 +5260,mul_494,call_function,mul.Tensor,backward,13,2,2,1,4660,1504,8 +5261,mul_495,call_function,mul.Tensor,backward,13,2,2,1,4652,4,8 +5262,sum_60,call_function,sum.dim_IntList,backward,13,1,1,1,4653,3,5 +5263,convert_element_type_1333,call_function,convert_element_type.default,backward,13,1,1,1,4661,1503,6 +5264,convert_element_type_1334,call_function,convert_element_type.default,backward,13,1,1,1,4654,2,3 +5265,add_242,call_function,add.Tensor,unknown,,2,2,1,4662,1502,10 +5266,dtype_cast_386,call_function,dtype_cast.default,backward,13,1,1,1,4655,1,3 +5267,alias_default_1372,call_function,alias.default,backward,13,1,1,0,4656,0,2 +5268,alias_default_1030,call_function,alias.default,unknown,,1,1,3,4663,1501,4 +5269,einsum_default_401,call_function,einsum.default,backward,13,2,2,1,4664,3,5 +5270,permute_775,call_function,permute.default,backward,13,1,1,1,4,1497,3 +5271,einsum_default_402,call_function,einsum.default,backward,13,2,2,1,4665,1496,5 +5272,permute_776,call_function,permute.default,backward,13,1,1,1,4665,2,4 +5273,dtype_cast_387,call_function,dtype_cast.default,backward,13,1,1,1,4666,1,4 +5274,alias_default_1367,call_function,alias.default,backward,13,1,1,0,4667,0,3 +5275,view_964,call_function,view.default,backward,13,1,1,1,4666,1495,4 +5276,permute_777,call_function,permute.default,backward,13,1,1,1,4667,1494,4 +5277,_scaled_dot_product_flash_attention_backward_14,call_function,_scaled_dot_product_flash_attention_backward.default,backward,13,8,8,3,4671,1493,2 +5278,getitem_294,call_function,getitem,backward,13,1,1,1,4672,1466,2 +5279,getitem_295,call_function,getitem,backward,13,1,1,1,4672,1467,2 +5280,getitem_296,call_function,getitem,backward,13,1,1,1,4672,1460,2 +5281,permute_778,call_function,permute.default,backward,13,1,1,1,4673,1459,2 
+5282,permute_779,call_function,permute.default,backward,13,1,1,1,4673,1466,2 +5283,permute_780,call_function,permute.default,backward,13,1,1,1,4673,1465,2 +5284,convert_element_type_1339,call_function,convert_element_type.default,backward,13,1,1,1,4674,1465,2 +5285,convert_element_type_1340,call_function,convert_element_type.default,backward,13,1,1,1,4674,1464,2 +5286,view_965,call_function,view.default,backward,13,1,1,1,4675,1464,2 +5287,view_as_complex_84,call_function,view_as_complex.default,backward,13,1,1,1,4676,1463,6 +5288,_conj_28,call_function,_conj.default,backward,13,1,1,1,4,1464,3 +5289,clone_118,call_function,clone.default,backward,13,1,1,1,5,1463,3 +5290,mul_496,call_function,mul.Tensor,backward,13,2,2,1,4679,1462,8 +5291,view_966,call_function,view.default,backward,13,1,1,1,4675,1463,2 +5292,view_as_complex_85,call_function,view_as_complex.default,backward,13,1,1,1,4676,1462,6 +5293,_conj_29,call_function,_conj.default,backward,13,1,1,1,4,1463,3 +5294,clone_119,call_function,clone.default,backward,13,1,1,1,5,1462,3 +5295,mul_497,call_function,mul.Tensor,backward,13,2,2,1,4679,1461,8 +5296,view_as_real_84,call_function,view_as_real.default,backward,13,1,1,1,4680,1461,6 +5297,view_967,call_function,view.default,backward,13,1,1,1,4681,1460,6 +5298,convert_element_type_1341,call_function,convert_element_type.default,backward,13,1,1,1,4682,1459,6 +5299,view_as_real_85,call_function,view_as_real.default,backward,13,1,1,1,4680,1460,6 +5300,view_968,call_function,view.default,backward,13,1,1,1,4681,1459,6 +5301,convert_element_type_1342,call_function,convert_element_type.default,backward,13,1,1,1,4682,1458,6 +5302,view_969,call_function,view.default,backward,13,1,1,1,4674,1458,2 +5303,view_970,call_function,view.default,backward,13,1,1,1,4683,1458,5 +5304,view_971,call_function,view.default,backward,13,1,1,1,4683,1457,5 +5305,alias_default_1031,call_function,alias.default,backward,13,1,1,2,4675,1457,4 
+5306,einsum_default_403,call_function,einsum.default,backward,13,2,2,1,4676,3,5 +5307,permute_783,call_function,permute.default,backward,13,1,1,1,4,1453,3 +5308,einsum_default_404,call_function,einsum.default,backward,13,2,2,1,4677,1452,5 +5309,permute_784,call_function,permute.default,backward,13,1,1,1,4677,2,4 +5310,dtype_cast_388,call_function,dtype_cast.default,backward,13,1,1,1,4678,1,4 +5311,alias_default_1366,call_function,alias.default,backward,13,1,1,0,4679,0,3 +5312,alias_default_1032,call_function,alias.default,backward,13,1,1,2,4684,1457,4 +5313,einsum_default_405,call_function,einsum.default,backward,13,2,2,1,4685,3,5 +5314,permute_787,call_function,permute.default,backward,13,1,1,1,4,1453,3 +5315,einsum_default_406,call_function,einsum.default,backward,13,2,2,1,4686,1452,5 +5316,add_243,call_function,add.Tensor,unknown,,2,2,1,4693,1451,10 +5317,permute_788,call_function,permute.default,backward,13,1,1,1,4686,2,4 +5318,dtype_cast_389,call_function,dtype_cast.default,backward,13,1,1,1,4687,1,4 +5319,alias_default_1365,call_function,alias.default,backward,13,1,1,0,4688,0,3 +5320,alias_default_1033,call_function,alias.default,backward,13,1,1,2,4684,1456,4 +5321,einsum_default_407,call_function,einsum.default,backward,13,2,2,1,4685,3,5 +5322,permute_791,call_function,permute.default,backward,13,1,1,1,4,1452,3 +5323,einsum_default_408,call_function,einsum.default,backward,13,2,2,1,4686,1451,5 +5324,add_244,call_function,add.Tensor,unknown,,2,2,1,4709,1450,10 +5325,permute_792,call_function,permute.default,backward,13,1,1,1,4686,2,4 +5326,dtype_cast_390,call_function,dtype_cast.default,backward,13,1,1,1,4687,1,4 +5327,alias_default_1364,call_function,alias.default,backward,13,1,1,0,4688,0,3 +5328,convert_element_type_1355,call_function,convert_element_type.default,backward,13,1,1,1,4710,1449,8 +5329,convert_element_type_1356,call_function,convert_element_type.default,backward,13,1,1,1,1439,1449,4 
+5330,convert_element_type_1357,call_function,convert_element_type.default,backward,13,1,1,1,3,1443,2 +5331,alias_default_1034,call_function,alias.default,backward,13,1,1,2,4711,1448,4 +5332,mul_498,call_function,mul.Tensor,backward,13,2,2,1,4713,1442,8 +5333,mul_499,call_function,mul.Tensor,backward,13,2,2,1,1447,1448,8 +5334,alias_default_1035,call_function,alias.default,backward,13,1,1,2,4714,1441,4 +5335,alias_default_1036,call_function,alias.default,backward,13,1,1,3,1448,1447,4 +5336,mul_500,call_function,mul.Tensor,backward,13,2,2,1,4718,1440,8 +5337,sum_61,call_function,sum.dim_IntList,backward,13,1,1,1,4719,1439,5 +5338,div_58,call_function,div.Tensor,backward,13,1,1,1,1449,1439,6 +5339,mul_501,call_function,mul.Tensor,backward,13,2,2,1,4721,1438,8 +5340,sub_45,call_function,sub.Tensor,backward,13,2,2,1,4722,1437,10 +5341,mul_502,call_function,mul.Tensor,backward,13,2,2,1,4723,1436,8 +5342,mul_503,call_function,mul.Tensor,backward,13,2,2,1,4715,4,8 +5343,sum_62,call_function,sum.dim_IntList,backward,13,1,1,1,4716,3,5 +5344,convert_element_type_1358,call_function,convert_element_type.default,backward,13,1,1,1,4724,1435,6 +5345,convert_element_type_1359,call_function,convert_element_type.default,backward,13,1,1,1,4717,2,3 +5346,add_245,call_function,add.Tensor,unknown,,2,2,1,4725,1434,10 +5347,dtype_cast_391,call_function,dtype_cast.default,backward,13,1,1,1,4718,1,3 +5348,alias_default_1371,call_function,alias.default,backward,13,1,1,0,4719,0,2 +5349,alias_default_1037,call_function,alias.default,unknown,,1,1,3,4726,1433,4 +5350,einsum_default_409,call_function,einsum.default,backward,12,2,2,1,4727,3,5 +5351,permute_795,call_function,permute.default,backward,12,1,1,1,4,1429,3 +5352,einsum_default_410,call_function,einsum.default,backward,12,2,2,1,4728,1428,5 +5353,permute_796,call_function,permute.default,backward,12,1,1,1,4728,2,4 +5354,dtype_cast_392,call_function,dtype_cast.default,backward,12,1,1,1,4729,1,4 
+5355,alias_default_1360,call_function,alias.default,backward,12,1,1,0,4730,0,3 +5356,alias_default_1038,call_function,alias.default,backward,12,1,1,2,4729,1427,4 +5357,mul_504,call_function,mul.Tensor,backward,12,2,2,1,4730,1415,8 +5358,mul_505,call_function,mul.Tensor,backward,12,2,2,1,4730,1419,8 +5359,alias_default_1039,call_function,alias.default,backward,12,1,1,2,4731,1414,4 +5360,einsum_default_411,call_function,einsum.default,backward,12,2,2,1,4732,3,5 +5361,permute_799,call_function,permute.default,backward,12,1,1,1,4,1410,3 +5362,einsum_default_412,call_function,einsum.default,backward,12,2,2,1,4733,1409,5 +5363,permute_800,call_function,permute.default,backward,12,1,1,1,4733,2,4 +5364,dtype_cast_393,call_function,dtype_cast.default,backward,12,1,1,1,4734,1,4 +5365,alias_default_1361,call_function,alias.default,backward,12,1,1,0,4735,0,3 +5366,convert_element_type_1368,call_function,convert_element_type.default,backward,12,1,1,1,4731,1418,6 +5367,convert_element_type_1369,call_function,convert_element_type.default,backward,12,1,1,1,1416,1428,4 +5368,alias_default_1040,call_function,alias.default,backward,12,1,1,2,1417,1427,4 +5369,neg_43,call_function,neg.default,backward,12,1,1,1,1418,1426,8 +5370,exp_43,call_function,exp.default,backward,12,1,1,1,1419,1425,6 +5371,add_246,call_function,add.Tensor,backward,12,1,1,1,1420,1424,4 +5372,reciprocal_15,call_function,reciprocal.default,backward,12,1,1,1,1421,1423,4 +5373,mul_506,call_function,mul.Tensor,backward,12,1,1,1,1422,1422,6 +5374,alias_default_1041,call_function,alias.default,backward,12,1,1,2,1423,1421,4 +5375,mul_507,call_function,mul.Tensor,backward,12,2,2,1,4740,1417,8 +5376,sub_46,call_function,sub.Tensor,backward,12,1,1,1,1424,1419,4 +5377,mul_508,call_function,mul.Tensor,backward,12,2,2,1,1425,1418,8 +5378,add_247,call_function,add.Tensor,backward,12,1,1,1,1426,1417,4 +5379,mul_509,call_function,mul.Tensor,backward,12,2,2,1,4744,1416,8 
+5380,convert_element_type_1370,call_function,convert_element_type.default,backward,12,1,1,1,4745,1415,6 +5381,alias_default_1042,call_function,alias.default,backward,12,1,1,2,4746,1414,4 +5382,einsum_default_413,call_function,einsum.default,backward,12,2,2,1,4747,3,5 +5383,permute_803,call_function,permute.default,backward,12,1,1,1,4,1410,3 +5384,einsum_default_414,call_function,einsum.default,backward,12,2,2,1,4748,1409,5 +5385,add_248,call_function,add.Tensor,unknown,,2,2,1,4753,1408,10 +5386,permute_804,call_function,permute.default,backward,12,1,1,1,4748,2,4 +5387,dtype_cast_394,call_function,dtype_cast.default,backward,12,1,1,1,4749,1,4 +5388,alias_default_1359,call_function,alias.default,backward,12,1,1,0,4750,0,3 +5389,convert_element_type_1375,call_function,convert_element_type.default,backward,12,1,1,1,4754,1407,8 +5390,convert_element_type_1376,call_function,convert_element_type.default,backward,12,1,1,1,1396,1407,4 +5391,convert_element_type_1377,call_function,convert_element_type.default,backward,12,1,1,1,3,1401,2 +5392,alias_default_1043,call_function,alias.default,backward,12,1,1,2,4755,1406,4 +5393,mul_510,call_function,mul.Tensor,backward,12,2,2,1,4757,1400,8 +5394,mul_511,call_function,mul.Tensor,backward,12,2,2,1,1404,1406,8 +5395,alias_default_1044,call_function,alias.default,backward,12,1,1,2,4758,1399,4 +5396,alias_default_1045,call_function,alias.default,backward,12,1,1,3,1405,1405,4 +5397,mul_512,call_function,mul.Tensor,backward,12,2,2,1,4762,1398,8 +5398,sum_63,call_function,sum.dim_IntList,backward,12,1,1,1,4763,1397,5 +5399,div_59,call_function,div.Tensor,backward,12,1,1,1,1406,1397,6 +5400,mul_513,call_function,mul.Tensor,backward,12,2,2,1,4765,1396,8 +5401,sub_47,call_function,sub.Tensor,backward,12,2,2,1,4766,1395,10 +5402,mul_514,call_function,mul.Tensor,backward,12,2,2,1,4767,1394,8 +5403,mul_515,call_function,mul.Tensor,backward,12,2,2,1,4759,4,8 +5404,sum_64,call_function,sum.dim_IntList,backward,12,1,1,1,4760,3,5 
+5405,convert_element_type_1378,call_function,convert_element_type.default,backward,12,1,1,1,4768,1393,6 +5406,convert_element_type_1379,call_function,convert_element_type.default,backward,12,1,1,1,4761,2,3 +5407,add_249,call_function,add.Tensor,unknown,,2,2,1,4769,1392,10 +5408,dtype_cast_395,call_function,dtype_cast.default,backward,12,1,1,1,4762,1,3 +5409,alias_default_1363,call_function,alias.default,backward,12,1,1,0,4763,0,2 +5410,alias_default_1046,call_function,alias.default,unknown,,1,1,3,4770,1391,4 +5411,einsum_default_415,call_function,einsum.default,backward,12,2,2,1,4771,3,5 +5412,permute_807,call_function,permute.default,backward,12,1,1,1,4,1387,3 +5413,einsum_default_416,call_function,einsum.default,backward,12,2,2,1,4772,1386,5 +5414,permute_808,call_function,permute.default,backward,12,1,1,1,4772,2,4 +5415,dtype_cast_396,call_function,dtype_cast.default,backward,12,1,1,1,4773,1,4 +5416,alias_default_1358,call_function,alias.default,backward,12,1,1,0,4774,0,3 +5417,view_986,call_function,view.default,backward,12,1,1,1,4773,1385,4 +5418,permute_809,call_function,permute.default,backward,12,1,1,1,4774,1384,4 +5419,_scaled_dot_product_flash_attention_backward_15,call_function,_scaled_dot_product_flash_attention_backward.default,backward,12,8,8,3,4778,1383,2 +5420,getitem_297,call_function,getitem,backward,12,1,1,1,4779,1356,2 +5421,getitem_298,call_function,getitem,backward,12,1,1,1,4779,1357,2 +5422,getitem_299,call_function,getitem,backward,12,1,1,1,4779,1350,2 +5423,permute_810,call_function,permute.default,backward,12,1,1,1,4780,1349,2 +5424,permute_811,call_function,permute.default,backward,12,1,1,1,4780,1356,2 +5425,permute_812,call_function,permute.default,backward,12,1,1,1,4780,1355,2 +5426,convert_element_type_1384,call_function,convert_element_type.default,backward,12,1,1,1,4781,1355,2 +5427,convert_element_type_1385,call_function,convert_element_type.default,backward,12,1,1,1,4781,1354,2 
+5428,view_987,call_function,view.default,backward,12,1,1,1,4782,1354,2 +5429,view_as_complex_86,call_function,view_as_complex.default,backward,12,1,1,1,4783,1353,6 +5430,_conj_30,call_function,_conj.default,backward,12,1,1,1,4,1354,3 +5431,clone_126,call_function,clone.default,backward,12,1,1,1,5,1353,3 +5432,mul_516,call_function,mul.Tensor,backward,12,2,2,1,4786,1352,8 +5433,view_988,call_function,view.default,backward,12,1,1,1,4782,1353,2 +5434,view_as_complex_87,call_function,view_as_complex.default,backward,12,1,1,1,4783,1352,6 +5435,_conj_31,call_function,_conj.default,backward,12,1,1,1,4,1353,3 +5436,clone_127,call_function,clone.default,backward,12,1,1,1,5,1352,3 +5437,mul_517,call_function,mul.Tensor,backward,12,2,2,1,4786,1351,8 +5438,view_as_real_86,call_function,view_as_real.default,backward,12,1,1,1,4787,1351,6 +5439,view_989,call_function,view.default,backward,12,1,1,1,4788,1350,6 +5440,convert_element_type_1386,call_function,convert_element_type.default,backward,12,1,1,1,4789,1349,6 +5441,view_as_real_87,call_function,view_as_real.default,backward,12,1,1,1,4787,1350,6 +5442,view_990,call_function,view.default,backward,12,1,1,1,4788,1349,6 +5443,convert_element_type_1387,call_function,convert_element_type.default,backward,12,1,1,1,4789,1348,6 +5444,view_991,call_function,view.default,backward,12,1,1,1,4781,1348,2 +5445,view_992,call_function,view.default,backward,12,1,1,1,4790,1348,5 +5446,view_993,call_function,view.default,backward,12,1,1,1,4790,1347,5 +5447,alias_default_1047,call_function,alias.default,backward,12,1,1,2,4782,1347,4 +5448,einsum_default_417,call_function,einsum.default,backward,12,2,2,1,4783,3,5 +5449,permute_815,call_function,permute.default,backward,12,1,1,1,4,1343,3 +5450,einsum_default_418,call_function,einsum.default,backward,12,2,2,1,4784,1342,5 +5451,permute_816,call_function,permute.default,backward,12,1,1,1,4784,2,4 +5452,dtype_cast_397,call_function,dtype_cast.default,backward,12,1,1,1,4785,1,4 
+5453,alias_default_1357,call_function,alias.default,backward,12,1,1,0,4786,0,3 +5454,alias_default_1048,call_function,alias.default,backward,12,1,1,2,4791,1347,4 +5455,einsum_default_419,call_function,einsum.default,backward,12,2,2,1,4792,3,5 +5456,permute_819,call_function,permute.default,backward,12,1,1,1,4,1343,3 +5457,einsum_default_420,call_function,einsum.default,backward,12,2,2,1,4793,1342,5 +5458,add_250,call_function,add.Tensor,unknown,,2,2,1,4800,1341,10 +5459,permute_820,call_function,permute.default,backward,12,1,1,1,4793,2,4 +5460,dtype_cast_398,call_function,dtype_cast.default,backward,12,1,1,1,4794,1,4 +5461,alias_default_1356,call_function,alias.default,backward,12,1,1,0,4795,0,3 +5462,alias_default_1049,call_function,alias.default,backward,12,1,1,2,4791,1346,4 +5463,einsum_default_421,call_function,einsum.default,backward,12,2,2,1,4792,3,5 +5464,permute_823,call_function,permute.default,backward,12,1,1,1,4,1342,3 +5465,einsum_default_422,call_function,einsum.default,backward,12,2,2,1,4793,1341,5 +5466,add_251,call_function,add.Tensor,unknown,,2,2,1,4816,1340,10 +5467,permute_824,call_function,permute.default,backward,12,1,1,1,4793,2,4 +5468,dtype_cast_399,call_function,dtype_cast.default,backward,12,1,1,1,4794,1,4 +5469,alias_default_1355,call_function,alias.default,backward,12,1,1,0,4795,0,3 +5470,convert_element_type_1400,call_function,convert_element_type.default,backward,12,1,1,1,4817,1339,8 +5471,convert_element_type_1401,call_function,convert_element_type.default,backward,12,1,1,1,1329,1339,4 +5472,convert_element_type_1402,call_function,convert_element_type.default,backward,12,1,1,1,3,1333,2 +5473,alias_default_1050,call_function,alias.default,backward,12,1,1,2,4818,1338,4 +5474,mul_518,call_function,mul.Tensor,backward,12,2,2,1,4820,1332,8 +5475,mul_519,call_function,mul.Tensor,backward,12,2,2,1,1337,1338,8 +5476,alias_default_1051,call_function,alias.default,backward,12,1,1,2,4821,1331,4 
+5477,alias_default_1052,call_function,alias.default,backward,12,1,1,3,1338,1337,4 +5478,mul_520,call_function,mul.Tensor,backward,12,2,2,1,4825,1330,8 +5479,sum_65,call_function,sum.dim_IntList,backward,12,1,1,1,4826,1329,5 +5480,div_60,call_function,div.Tensor,backward,12,1,1,1,1339,1329,6 +5481,mul_521,call_function,mul.Tensor,backward,12,2,2,1,4828,1328,8 +5482,sub_48,call_function,sub.Tensor,backward,12,2,2,1,4829,1327,10 +5483,mul_522,call_function,mul.Tensor,backward,12,2,2,1,4830,1326,8 +5484,mul_523,call_function,mul.Tensor,backward,12,2,2,1,4822,4,8 +5485,sum_66,call_function,sum.dim_IntList,backward,12,1,1,1,4823,3,5 +5486,convert_element_type_1403,call_function,convert_element_type.default,backward,12,1,1,1,4831,1325,6 +5487,convert_element_type_1404,call_function,convert_element_type.default,backward,12,1,1,1,4824,2,3 +5488,add_252,call_function,add.Tensor,unknown,,2,2,1,4832,1324,10 +5489,dtype_cast_400,call_function,dtype_cast.default,backward,12,1,1,1,4825,1,3 +5490,alias_default_1362,call_function,alias.default,backward,12,1,1,0,4826,0,2 +5491,alias_default_1053,call_function,alias.default,unknown,,1,1,3,4833,1323,4 +5492,einsum_default_423,call_function,einsum.default,backward,11,2,2,1,4834,3,5 +5493,permute_827,call_function,permute.default,backward,11,1,1,1,4,1319,3 +5494,einsum_default_424,call_function,einsum.default,backward,11,2,2,1,4835,1318,5 +5495,permute_828,call_function,permute.default,backward,11,1,1,1,4835,2,4 +5496,dtype_cast_401,call_function,dtype_cast.default,backward,11,1,1,1,4836,1,4 +5497,alias_default_1351,call_function,alias.default,backward,11,1,1,0,4837,0,3 +5498,alias_default_1054,call_function,alias.default,backward,11,1,1,2,4836,1317,4 +5499,mul_524,call_function,mul.Tensor,backward,11,2,2,1,4837,1305,8 +5500,mul_525,call_function,mul.Tensor,backward,11,2,2,1,4837,1309,8 +5501,alias_default_1055,call_function,alias.default,backward,11,1,1,2,4838,1304,4 
+5502,einsum_default_425,call_function,einsum.default,backward,11,2,2,1,4839,3,5 +5503,permute_831,call_function,permute.default,backward,11,1,1,1,4,1300,3 +5504,einsum_default_426,call_function,einsum.default,backward,11,2,2,1,4840,1299,5 +5505,permute_832,call_function,permute.default,backward,11,1,1,1,4840,2,4 +5506,dtype_cast_402,call_function,dtype_cast.default,backward,11,1,1,1,4841,1,4 +5507,alias_default_1352,call_function,alias.default,backward,11,1,1,0,4842,0,3 +5508,convert_element_type_1413,call_function,convert_element_type.default,backward,11,1,1,1,4838,1308,6 +5509,convert_element_type_1414,call_function,convert_element_type.default,backward,11,1,1,1,1306,1318,4 +5510,alias_default_1056,call_function,alias.default,backward,11,1,1,2,1307,1317,4 +5511,neg_44,call_function,neg.default,backward,11,1,1,1,1308,1316,8 +5512,exp_44,call_function,exp.default,backward,11,1,1,1,1309,1315,6 +5513,add_253,call_function,add.Tensor,backward,11,1,1,1,1310,1314,4 +5514,reciprocal_16,call_function,reciprocal.default,backward,11,1,1,1,1311,1313,4 +5515,mul_526,call_function,mul.Tensor,backward,11,1,1,1,1312,1312,6 +5516,alias_default_1057,call_function,alias.default,backward,11,1,1,2,1313,1311,4 +5517,mul_527,call_function,mul.Tensor,backward,11,2,2,1,4847,1307,8 +5518,sub_49,call_function,sub.Tensor,backward,11,1,1,1,1314,1309,4 +5519,mul_528,call_function,mul.Tensor,backward,11,2,2,1,1315,1308,8 +5520,add_254,call_function,add.Tensor,backward,11,1,1,1,1316,1307,4 +5521,mul_529,call_function,mul.Tensor,backward,11,2,2,1,4851,1306,8 +5522,convert_element_type_1415,call_function,convert_element_type.default,backward,11,1,1,1,4852,1305,6 +5523,alias_default_1058,call_function,alias.default,backward,11,1,1,2,4853,1304,4 +5524,einsum_default_427,call_function,einsum.default,backward,11,2,2,1,4854,3,5 +5525,permute_835,call_function,permute.default,backward,11,1,1,1,4,1300,3 +5526,einsum_default_428,call_function,einsum.default,backward,11,2,2,1,4855,1299,5 
+5527,add_255,call_function,add.Tensor,unknown,,2,2,1,4860,1298,10 +5528,permute_836,call_function,permute.default,backward,11,1,1,1,4855,2,4 +5529,dtype_cast_403,call_function,dtype_cast.default,backward,11,1,1,1,4856,1,4 +5530,alias_default_1350,call_function,alias.default,backward,11,1,1,0,4857,0,3 +5531,convert_element_type_1420,call_function,convert_element_type.default,backward,11,1,1,1,4861,1297,8 +5532,convert_element_type_1421,call_function,convert_element_type.default,backward,11,1,1,1,1286,1297,4 +5533,convert_element_type_1422,call_function,convert_element_type.default,backward,11,1,1,1,3,1291,2 +5534,alias_default_1059,call_function,alias.default,backward,11,1,1,2,4862,1296,4 +5535,mul_530,call_function,mul.Tensor,backward,11,2,2,1,4864,1290,8 +5536,mul_531,call_function,mul.Tensor,backward,11,2,2,1,1294,1296,8 +5537,alias_default_1060,call_function,alias.default,backward,11,1,1,2,4865,1289,4 +5538,alias_default_1061,call_function,alias.default,backward,11,1,1,3,1295,1295,4 +5539,mul_532,call_function,mul.Tensor,backward,11,2,2,1,4869,1288,8 +5540,sum_67,call_function,sum.dim_IntList,backward,11,1,1,1,4870,1287,5 +5541,div_61,call_function,div.Tensor,backward,11,1,1,1,1296,1287,6 +5542,mul_533,call_function,mul.Tensor,backward,11,2,2,1,4872,1286,8 +5543,sub_50,call_function,sub.Tensor,backward,11,2,2,1,4873,1285,10 +5544,mul_534,call_function,mul.Tensor,backward,11,2,2,1,4874,1284,8 +5545,mul_535,call_function,mul.Tensor,backward,11,2,2,1,4866,4,8 +5546,sum_68,call_function,sum.dim_IntList,backward,11,1,1,1,4867,3,5 +5547,convert_element_type_1423,call_function,convert_element_type.default,backward,11,1,1,1,4875,1283,6 +5548,convert_element_type_1424,call_function,convert_element_type.default,backward,11,1,1,1,4868,2,3 +5549,add_256,call_function,add.Tensor,unknown,,2,2,1,4876,1282,10 +5550,dtype_cast_404,call_function,dtype_cast.default,backward,11,1,1,1,4869,1,3 +5551,alias_default_1354,call_function,alias.default,backward,11,1,1,0,4870,0,2 
+5552,alias_default_1062,call_function,alias.default,unknown,,1,1,3,4877,1281,4 +5553,einsum_default_429,call_function,einsum.default,backward,11,2,2,1,4878,3,5 +5554,permute_839,call_function,permute.default,backward,11,1,1,1,4,1277,3 +5555,einsum_default_430,call_function,einsum.default,backward,11,2,2,1,4879,1276,5 +5556,permute_840,call_function,permute.default,backward,11,1,1,1,4879,2,4 +5557,dtype_cast_405,call_function,dtype_cast.default,backward,11,1,1,1,4880,1,4 +5558,alias_default_1349,call_function,alias.default,backward,11,1,1,0,4881,0,3 +5559,view_1008,call_function,view.default,backward,11,1,1,1,4880,1275,4 +5560,permute_841,call_function,permute.default,backward,11,1,1,1,4881,1274,4 +5561,_scaled_dot_product_flash_attention_backward_16,call_function,_scaled_dot_product_flash_attention_backward.default,backward,11,8,8,3,4885,1273,2 +5562,getitem_300,call_function,getitem,backward,11,1,1,1,4886,1246,2 +5563,getitem_301,call_function,getitem,backward,11,1,1,1,4886,1247,2 +5564,getitem_302,call_function,getitem,backward,11,1,1,1,4886,1240,2 +5565,permute_842,call_function,permute.default,backward,11,1,1,1,4887,1239,2 +5566,permute_843,call_function,permute.default,backward,11,1,1,1,4887,1246,2 +5567,permute_844,call_function,permute.default,backward,11,1,1,1,4887,1245,2 +5568,convert_element_type_1429,call_function,convert_element_type.default,backward,11,1,1,1,4888,1245,2 +5569,convert_element_type_1430,call_function,convert_element_type.default,backward,11,1,1,1,4888,1244,2 +5570,view_1009,call_function,view.default,backward,11,1,1,1,4889,1244,2 +5571,view_as_complex_88,call_function,view_as_complex.default,backward,11,1,1,1,4890,1243,6 +5572,_conj_32,call_function,_conj.default,backward,11,1,1,1,4,1244,3 +5573,clone_134,call_function,clone.default,backward,11,1,1,1,5,1243,3 +5574,mul_536,call_function,mul.Tensor,backward,11,2,2,1,4893,1242,8 +5575,view_1010,call_function,view.default,backward,11,1,1,1,4889,1243,2 
+5576,view_as_complex_89,call_function,view_as_complex.default,backward,11,1,1,1,4890,1242,6 +5577,_conj_33,call_function,_conj.default,backward,11,1,1,1,4,1243,3 +5578,clone_135,call_function,clone.default,backward,11,1,1,1,5,1242,3 +5579,mul_537,call_function,mul.Tensor,backward,11,2,2,1,4893,1241,8 +5580,view_as_real_88,call_function,view_as_real.default,backward,11,1,1,1,4894,1241,6 +5581,view_1011,call_function,view.default,backward,11,1,1,1,4895,1240,6 +5582,convert_element_type_1431,call_function,convert_element_type.default,backward,11,1,1,1,4896,1239,6 +5583,view_as_real_89,call_function,view_as_real.default,backward,11,1,1,1,4894,1240,6 +5584,view_1012,call_function,view.default,backward,11,1,1,1,4895,1239,6 +5585,convert_element_type_1432,call_function,convert_element_type.default,backward,11,1,1,1,4896,1238,6 +5586,view_1013,call_function,view.default,backward,11,1,1,1,4888,1238,2 +5587,view_1014,call_function,view.default,backward,11,1,1,1,4897,1238,5 +5588,view_1015,call_function,view.default,backward,11,1,1,1,4897,1237,5 +5589,alias_default_1063,call_function,alias.default,backward,11,1,1,2,4889,1237,4 +5590,einsum_default_431,call_function,einsum.default,backward,11,2,2,1,4890,3,5 +5591,permute_847,call_function,permute.default,backward,11,1,1,1,4,1233,3 +5592,einsum_default_432,call_function,einsum.default,backward,11,2,2,1,4891,1232,5 +5593,permute_848,call_function,permute.default,backward,11,1,1,1,4891,2,4 +5594,dtype_cast_406,call_function,dtype_cast.default,backward,11,1,1,1,4892,1,4 +5595,alias_default_1348,call_function,alias.default,backward,11,1,1,0,4893,0,3 +5596,alias_default_1064,call_function,alias.default,backward,11,1,1,2,4898,1237,4 +5597,einsum_default_433,call_function,einsum.default,backward,11,2,2,1,4899,3,5 +5598,permute_851,call_function,permute.default,backward,11,1,1,1,4,1233,3 +5599,einsum_default_434,call_function,einsum.default,backward,11,2,2,1,4900,1232,5 
+5600,add_257,call_function,add.Tensor,unknown,,2,2,1,4907,1231,10 +5601,permute_852,call_function,permute.default,backward,11,1,1,1,4900,2,4 +5602,dtype_cast_407,call_function,dtype_cast.default,backward,11,1,1,1,4901,1,4 +5603,alias_default_1347,call_function,alias.default,backward,11,1,1,0,4902,0,3 +5604,alias_default_1065,call_function,alias.default,backward,11,1,1,2,4898,1236,4 +5605,einsum_default_435,call_function,einsum.default,backward,11,2,2,1,4899,3,5 +5606,permute_855,call_function,permute.default,backward,11,1,1,1,4,1232,3 +5607,einsum_default_436,call_function,einsum.default,backward,11,2,2,1,4900,1231,5 +5608,add_258,call_function,add.Tensor,unknown,,2,2,1,4923,1230,10 +5609,permute_856,call_function,permute.default,backward,11,1,1,1,4900,2,4 +5610,dtype_cast_408,call_function,dtype_cast.default,backward,11,1,1,1,4901,1,4 +5611,alias_default_1346,call_function,alias.default,backward,11,1,1,0,4902,0,3 +5612,convert_element_type_1445,call_function,convert_element_type.default,backward,11,1,1,1,4924,1229,8 +5613,convert_element_type_1446,call_function,convert_element_type.default,backward,11,1,1,1,1219,1229,4 +5614,convert_element_type_1447,call_function,convert_element_type.default,backward,11,1,1,1,3,1223,2 +5615,alias_default_1066,call_function,alias.default,backward,11,1,1,2,4925,1228,4 +5616,mul_538,call_function,mul.Tensor,backward,11,2,2,1,4927,1222,8 +5617,mul_539,call_function,mul.Tensor,backward,11,2,2,1,1227,1228,8 +5618,alias_default_1067,call_function,alias.default,backward,11,1,1,2,4928,1221,4 +5619,alias_default_1068,call_function,alias.default,backward,11,1,1,3,1228,1227,4 +5620,mul_540,call_function,mul.Tensor,backward,11,2,2,1,4932,1220,8 +5621,sum_69,call_function,sum.dim_IntList,backward,11,1,1,1,4933,1219,5 +5622,div_62,call_function,div.Tensor,backward,11,1,1,1,1229,1219,6 +5623,mul_541,call_function,mul.Tensor,backward,11,2,2,1,4935,1218,8 +5624,sub_51,call_function,sub.Tensor,backward,11,2,2,1,4936,1217,10 
+5625,mul_542,call_function,mul.Tensor,backward,11,2,2,1,4937,1216,8 +5626,mul_543,call_function,mul.Tensor,backward,11,2,2,1,4929,4,8 +5627,sum_70,call_function,sum.dim_IntList,backward,11,1,1,1,4930,3,5 +5628,convert_element_type_1448,call_function,convert_element_type.default,backward,11,1,1,1,4938,1215,6 +5629,convert_element_type_1449,call_function,convert_element_type.default,backward,11,1,1,1,4931,2,3 +5630,add_259,call_function,add.Tensor,unknown,,2,2,1,4939,1214,10 +5631,dtype_cast_409,call_function,dtype_cast.default,backward,11,1,1,1,4932,1,3 +5632,alias_default_1353,call_function,alias.default,backward,11,1,1,0,4933,0,2 +5633,alias_default_1069,call_function,alias.default,unknown,,1,1,3,4940,1213,4 +5634,einsum_default_437,call_function,einsum.default,backward,10,2,2,1,4941,3,5 +5635,permute_859,call_function,permute.default,backward,10,1,1,1,4,1209,3 +5636,einsum_default_438,call_function,einsum.default,backward,10,2,2,1,4942,1208,5 +5637,permute_860,call_function,permute.default,backward,10,1,1,1,4942,2,4 +5638,dtype_cast_410,call_function,dtype_cast.default,backward,10,1,1,1,4943,1,4 +5639,alias_default_1342,call_function,alias.default,backward,10,1,1,0,4944,0,3 +5640,alias_default_1070,call_function,alias.default,backward,10,1,1,2,4943,1207,4 +5641,mul_544,call_function,mul.Tensor,backward,10,2,2,1,4944,1195,8 +5642,mul_545,call_function,mul.Tensor,backward,10,2,2,1,4944,1199,8 +5643,alias_default_1071,call_function,alias.default,backward,10,1,1,2,4945,1194,4 +5644,einsum_default_439,call_function,einsum.default,backward,10,2,2,1,4946,3,5 +5645,permute_863,call_function,permute.default,backward,10,1,1,1,4,1190,3 +5646,einsum_default_440,call_function,einsum.default,backward,10,2,2,1,4947,1189,5 +5647,permute_864,call_function,permute.default,backward,10,1,1,1,4947,2,4 +5648,dtype_cast_411,call_function,dtype_cast.default,backward,10,1,1,1,4948,1,4 +5649,alias_default_1343,call_function,alias.default,backward,10,1,1,0,4949,0,3 
+5650,convert_element_type_1458,call_function,convert_element_type.default,backward,10,1,1,1,4945,1198,6 +5651,convert_element_type_1459,call_function,convert_element_type.default,backward,10,1,1,1,1196,1208,4 +5652,alias_default_1072,call_function,alias.default,backward,10,1,1,2,1197,1207,4 +5653,neg_45,call_function,neg.default,backward,10,1,1,1,1198,1206,8 +5654,exp_45,call_function,exp.default,backward,10,1,1,1,1199,1205,6 +5655,add_260,call_function,add.Tensor,backward,10,1,1,1,1200,1204,4 +5656,reciprocal_17,call_function,reciprocal.default,backward,10,1,1,1,1201,1203,4 +5657,mul_546,call_function,mul.Tensor,backward,10,1,1,1,1202,1202,6 +5658,alias_default_1073,call_function,alias.default,backward,10,1,1,2,1203,1201,4 +5659,mul_547,call_function,mul.Tensor,backward,10,2,2,1,4954,1197,8 +5660,sub_52,call_function,sub.Tensor,backward,10,1,1,1,1204,1199,4 +5661,mul_548,call_function,mul.Tensor,backward,10,2,2,1,1205,1198,8 +5662,add_261,call_function,add.Tensor,backward,10,1,1,1,1206,1197,4 +5663,mul_549,call_function,mul.Tensor,backward,10,2,2,1,4958,1196,8 +5664,convert_element_type_1460,call_function,convert_element_type.default,backward,10,1,1,1,4959,1195,6 +5665,alias_default_1074,call_function,alias.default,backward,10,1,1,2,4960,1194,4 +5666,einsum_default_441,call_function,einsum.default,backward,10,2,2,1,4961,3,5 +5667,permute_867,call_function,permute.default,backward,10,1,1,1,4,1190,3 +5668,einsum_default_442,call_function,einsum.default,backward,10,2,2,1,4962,1189,5 +5669,add_262,call_function,add.Tensor,unknown,,2,2,1,4967,1188,10 +5670,permute_868,call_function,permute.default,backward,10,1,1,1,4962,2,4 +5671,dtype_cast_412,call_function,dtype_cast.default,backward,10,1,1,1,4963,1,4 +5672,alias_default_1341,call_function,alias.default,backward,10,1,1,0,4964,0,3 +5673,convert_element_type_1465,call_function,convert_element_type.default,backward,10,1,1,1,4968,1187,8 
+5674,convert_element_type_1466,call_function,convert_element_type.default,backward,10,1,1,1,1176,1187,4 +5675,convert_element_type_1467,call_function,convert_element_type.default,backward,10,1,1,1,3,1181,2 +5676,alias_default_1075,call_function,alias.default,backward,10,1,1,2,4969,1186,4 +5677,mul_550,call_function,mul.Tensor,backward,10,2,2,1,4971,1180,8 +5678,mul_551,call_function,mul.Tensor,backward,10,2,2,1,1184,1186,8 +5679,alias_default_1076,call_function,alias.default,backward,10,1,1,2,4972,1179,4 +5680,alias_default_1077,call_function,alias.default,backward,10,1,1,3,1185,1185,4 +5681,mul_552,call_function,mul.Tensor,backward,10,2,2,1,4976,1178,8 +5682,sum_71,call_function,sum.dim_IntList,backward,10,1,1,1,4977,1177,5 +5683,div_63,call_function,div.Tensor,backward,10,1,1,1,1186,1177,6 +5684,mul_553,call_function,mul.Tensor,backward,10,2,2,1,4979,1176,8 +5685,sub_53,call_function,sub.Tensor,backward,10,2,2,1,4980,1175,10 +5686,mul_554,call_function,mul.Tensor,backward,10,2,2,1,4981,1174,8 +5687,mul_555,call_function,mul.Tensor,backward,10,2,2,1,4973,4,8 +5688,sum_72,call_function,sum.dim_IntList,backward,10,1,1,1,4974,3,5 +5689,convert_element_type_1468,call_function,convert_element_type.default,backward,10,1,1,1,4982,1173,6 +5690,convert_element_type_1469,call_function,convert_element_type.default,backward,10,1,1,1,4975,2,3 +5691,add_263,call_function,add.Tensor,unknown,,2,2,1,4983,1172,10 +5692,dtype_cast_413,call_function,dtype_cast.default,backward,10,1,1,1,4976,1,3 +5693,alias_default_1345,call_function,alias.default,backward,10,1,1,0,4977,0,2 +5694,alias_default_1078,call_function,alias.default,unknown,,1,1,3,4984,1171,4 +5695,einsum_default_443,call_function,einsum.default,backward,10,2,2,1,4985,3,5 +5696,permute_871,call_function,permute.default,backward,10,1,1,1,4,1167,3 +5697,einsum_default_444,call_function,einsum.default,backward,10,2,2,1,4986,1166,5 +5698,permute_872,call_function,permute.default,backward,10,1,1,1,4986,2,4 
+5699,dtype_cast_414,call_function,dtype_cast.default,backward,10,1,1,1,4987,1,4 +5700,alias_default_1340,call_function,alias.default,backward,10,1,1,0,4988,0,3 +5701,view_1030,call_function,view.default,backward,10,1,1,1,4987,1165,4 +5702,permute_873,call_function,permute.default,backward,10,1,1,1,4988,1164,4 +5703,_scaled_dot_product_flash_attention_backward_17,call_function,_scaled_dot_product_flash_attention_backward.default,backward,10,8,8,3,4992,1163,2 +5704,getitem_303,call_function,getitem,backward,10,1,1,1,4993,1136,2 +5705,getitem_304,call_function,getitem,backward,10,1,1,1,4993,1137,2 +5706,getitem_305,call_function,getitem,backward,10,1,1,1,4993,1130,2 +5707,permute_874,call_function,permute.default,backward,10,1,1,1,4994,1129,2 +5708,permute_875,call_function,permute.default,backward,10,1,1,1,4994,1136,2 +5709,permute_876,call_function,permute.default,backward,10,1,1,1,4994,1135,2 +5710,convert_element_type_1474,call_function,convert_element_type.default,backward,10,1,1,1,4995,1135,2 +5711,convert_element_type_1475,call_function,convert_element_type.default,backward,10,1,1,1,4995,1134,2 +5712,view_1031,call_function,view.default,backward,10,1,1,1,4996,1134,2 +5713,view_as_complex_90,call_function,view_as_complex.default,backward,10,1,1,1,4997,1133,6 +5714,_conj_34,call_function,_conj.default,backward,10,1,1,1,4,1134,3 +5715,clone_142,call_function,clone.default,backward,10,1,1,1,5,1133,3 +5716,mul_556,call_function,mul.Tensor,backward,10,2,2,1,5000,1132,8 +5717,view_1032,call_function,view.default,backward,10,1,1,1,4996,1133,2 +5718,view_as_complex_91,call_function,view_as_complex.default,backward,10,1,1,1,4997,1132,6 +5719,_conj_35,call_function,_conj.default,backward,10,1,1,1,4,1133,3 +5720,clone_143,call_function,clone.default,backward,10,1,1,1,5,1132,3 +5721,mul_557,call_function,mul.Tensor,backward,10,2,2,1,5000,1131,8 +5722,view_as_real_90,call_function,view_as_real.default,backward,10,1,1,1,5001,1131,6 
+5723,view_1033,call_function,view.default,backward,10,1,1,1,5002,1130,6 +5724,convert_element_type_1476,call_function,convert_element_type.default,backward,10,1,1,1,5003,1129,6 +5725,view_as_real_91,call_function,view_as_real.default,backward,10,1,1,1,5001,1130,6 +5726,view_1034,call_function,view.default,backward,10,1,1,1,5002,1129,6 +5727,convert_element_type_1477,call_function,convert_element_type.default,backward,10,1,1,1,5003,1128,6 +5728,view_1035,call_function,view.default,backward,10,1,1,1,4995,1128,2 +5729,view_1036,call_function,view.default,backward,10,1,1,1,5004,1128,5 +5730,view_1037,call_function,view.default,backward,10,1,1,1,5004,1127,5 +5731,alias_default_1079,call_function,alias.default,backward,10,1,1,2,4996,1127,4 +5732,einsum_default_445,call_function,einsum.default,backward,10,2,2,1,4997,3,5 +5733,permute_879,call_function,permute.default,backward,10,1,1,1,4,1123,3 +5734,einsum_default_446,call_function,einsum.default,backward,10,2,2,1,4998,1122,5 +5735,permute_880,call_function,permute.default,backward,10,1,1,1,4998,2,4 +5736,dtype_cast_415,call_function,dtype_cast.default,backward,10,1,1,1,4999,1,4 +5737,alias_default_1339,call_function,alias.default,backward,10,1,1,0,5000,0,3 +5738,alias_default_1080,call_function,alias.default,backward,10,1,1,2,5005,1127,4 +5739,einsum_default_447,call_function,einsum.default,backward,10,2,2,1,5006,3,5 +5740,permute_883,call_function,permute.default,backward,10,1,1,1,4,1123,3 +5741,einsum_default_448,call_function,einsum.default,backward,10,2,2,1,5007,1122,5 +5742,add_264,call_function,add.Tensor,unknown,,2,2,1,5014,1121,10 +5743,permute_884,call_function,permute.default,backward,10,1,1,1,5007,2,4 +5744,dtype_cast_416,call_function,dtype_cast.default,backward,10,1,1,1,5008,1,4 +5745,alias_default_1338,call_function,alias.default,backward,10,1,1,0,5009,0,3 +5746,alias_default_1081,call_function,alias.default,backward,10,1,1,2,5005,1126,4 
+5747,einsum_default_449,call_function,einsum.default,backward,10,2,2,1,5006,3,5 +5748,permute_887,call_function,permute.default,backward,10,1,1,1,4,1122,3 +5749,einsum_default_450,call_function,einsum.default,backward,10,2,2,1,5007,1121,5 +5750,add_265,call_function,add.Tensor,unknown,,2,2,1,5030,1120,10 +5751,permute_888,call_function,permute.default,backward,10,1,1,1,5007,2,4 +5752,dtype_cast_417,call_function,dtype_cast.default,backward,10,1,1,1,5008,1,4 +5753,alias_default_1337,call_function,alias.default,backward,10,1,1,0,5009,0,3 +5754,convert_element_type_1490,call_function,convert_element_type.default,backward,10,1,1,1,5031,1119,8 +5755,convert_element_type_1491,call_function,convert_element_type.default,backward,10,1,1,1,1109,1119,4 +5756,convert_element_type_1492,call_function,convert_element_type.default,backward,10,1,1,1,3,1113,2 +5757,alias_default_1082,call_function,alias.default,backward,10,1,1,2,5032,1118,4 +5758,mul_558,call_function,mul.Tensor,backward,10,2,2,1,5034,1112,8 +5759,mul_559,call_function,mul.Tensor,backward,10,2,2,1,1117,1118,8 +5760,alias_default_1083,call_function,alias.default,backward,10,1,1,2,5035,1111,4 +5761,alias_default_1084,call_function,alias.default,backward,10,1,1,3,1118,1117,4 +5762,mul_560,call_function,mul.Tensor,backward,10,2,2,1,5039,1110,8 +5763,sum_73,call_function,sum.dim_IntList,backward,10,1,1,1,5040,1109,5 +5764,div_64,call_function,div.Tensor,backward,10,1,1,1,1119,1109,6 +5765,mul_561,call_function,mul.Tensor,backward,10,2,2,1,5042,1108,8 +5766,sub_54,call_function,sub.Tensor,backward,10,2,2,1,5043,1107,10 +5767,mul_562,call_function,mul.Tensor,backward,10,2,2,1,5044,1106,8 +5768,mul_563,call_function,mul.Tensor,backward,10,2,2,1,5036,4,8 +5769,sum_74,call_function,sum.dim_IntList,backward,10,1,1,1,5037,3,5 +5770,convert_element_type_1493,call_function,convert_element_type.default,backward,10,1,1,1,5045,1105,6 
+5771,convert_element_type_1494,call_function,convert_element_type.default,backward,10,1,1,1,5038,2,3 +5772,add_266,call_function,add.Tensor,unknown,,2,2,1,5046,1104,10 +5773,dtype_cast_418,call_function,dtype_cast.default,backward,10,1,1,1,5039,1,3 +5774,alias_default_1344,call_function,alias.default,backward,10,1,1,0,5040,0,2 +5775,alias_default_1085,call_function,alias.default,unknown,,1,1,3,5047,1103,4 +5776,einsum_default_451,call_function,einsum.default,backward,9,2,2,1,5048,3,5 +5777,permute_891,call_function,permute.default,backward,9,1,1,1,4,1099,3 +5778,einsum_default_452,call_function,einsum.default,backward,9,2,2,1,5049,1098,5 +5779,permute_892,call_function,permute.default,backward,9,1,1,1,5049,2,4 +5780,dtype_cast_419,call_function,dtype_cast.default,backward,9,1,1,1,5050,1,4 +5781,alias_default_1333,call_function,alias.default,backward,9,1,1,0,5051,0,3 +5782,alias_default_1086,call_function,alias.default,backward,9,1,1,2,5050,1097,4 +5783,mul_564,call_function,mul.Tensor,backward,9,2,2,1,5051,1085,8 +5784,mul_565,call_function,mul.Tensor,backward,9,2,2,1,5051,1089,8 +5785,alias_default_1087,call_function,alias.default,backward,9,1,1,2,5052,1084,4 +5786,einsum_default_453,call_function,einsum.default,backward,9,2,2,1,5053,3,5 +5787,permute_895,call_function,permute.default,backward,9,1,1,1,4,1080,3 +5788,einsum_default_454,call_function,einsum.default,backward,9,2,2,1,5054,1079,5 +5789,permute_896,call_function,permute.default,backward,9,1,1,1,5054,2,4 +5790,dtype_cast_420,call_function,dtype_cast.default,backward,9,1,1,1,5055,1,4 +5791,alias_default_1334,call_function,alias.default,backward,9,1,1,0,5056,0,3 +5792,convert_element_type_1503,call_function,convert_element_type.default,backward,9,1,1,1,5052,1088,6 +5793,convert_element_type_1504,call_function,convert_element_type.default,backward,9,1,1,1,1086,1098,4 +5794,alias_default_1088,call_function,alias.default,backward,9,1,1,2,1087,1097,4 
+5795,neg_46,call_function,neg.default,backward,9,1,1,1,1088,1096,8 +5796,exp_46,call_function,exp.default,backward,9,1,1,1,1089,1095,6 +5797,add_267,call_function,add.Tensor,backward,9,1,1,1,1090,1094,4 +5798,reciprocal_18,call_function,reciprocal.default,backward,9,1,1,1,1091,1093,4 +5799,mul_566,call_function,mul.Tensor,backward,9,1,1,1,1092,1092,6 +5800,alias_default_1089,call_function,alias.default,backward,9,1,1,2,1093,1091,4 +5801,mul_567,call_function,mul.Tensor,backward,9,2,2,1,5061,1087,8 +5802,sub_55,call_function,sub.Tensor,backward,9,1,1,1,1094,1089,4 +5803,mul_568,call_function,mul.Tensor,backward,9,2,2,1,1095,1088,8 +5804,add_268,call_function,add.Tensor,backward,9,1,1,1,1096,1087,4 +5805,mul_569,call_function,mul.Tensor,backward,9,2,2,1,5065,1086,8 +5806,convert_element_type_1505,call_function,convert_element_type.default,backward,9,1,1,1,5066,1085,6 +5807,alias_default_1090,call_function,alias.default,backward,9,1,1,2,5067,1084,4 +5808,einsum_default_455,call_function,einsum.default,backward,9,2,2,1,5068,3,5 +5809,permute_899,call_function,permute.default,backward,9,1,1,1,4,1080,3 +5810,einsum_default_456,call_function,einsum.default,backward,9,2,2,1,5069,1079,5 +5811,add_269,call_function,add.Tensor,unknown,,2,2,1,5074,1078,10 +5812,permute_900,call_function,permute.default,backward,9,1,1,1,5069,2,4 +5813,dtype_cast_421,call_function,dtype_cast.default,backward,9,1,1,1,5070,1,4 +5814,alias_default_1332,call_function,alias.default,backward,9,1,1,0,5071,0,3 +5815,convert_element_type_1510,call_function,convert_element_type.default,backward,9,1,1,1,5075,1077,8 +5816,convert_element_type_1511,call_function,convert_element_type.default,backward,9,1,1,1,1066,1077,4 +5817,convert_element_type_1512,call_function,convert_element_type.default,backward,9,1,1,1,3,1071,2 +5818,alias_default_1091,call_function,alias.default,backward,9,1,1,2,5076,1076,4 +5819,mul_570,call_function,mul.Tensor,backward,9,2,2,1,5078,1070,8 
+5820,mul_571,call_function,mul.Tensor,backward,9,2,2,1,1074,1076,8 +5821,alias_default_1092,call_function,alias.default,backward,9,1,1,2,5079,1069,4 +5822,alias_default_1093,call_function,alias.default,backward,9,1,1,3,1075,1075,4 +5823,mul_572,call_function,mul.Tensor,backward,9,2,2,1,5083,1068,8 +5824,sum_75,call_function,sum.dim_IntList,backward,9,1,1,1,5084,1067,5 +5825,div_65,call_function,div.Tensor,backward,9,1,1,1,1076,1067,6 +5826,mul_573,call_function,mul.Tensor,backward,9,2,2,1,5086,1066,8 +5827,sub_56,call_function,sub.Tensor,backward,9,2,2,1,5087,1065,10 +5828,mul_574,call_function,mul.Tensor,backward,9,2,2,1,5088,1064,8 +5829,mul_575,call_function,mul.Tensor,backward,9,2,2,1,5080,4,8 +5830,sum_76,call_function,sum.dim_IntList,backward,9,1,1,1,5081,3,5 +5831,convert_element_type_1513,call_function,convert_element_type.default,backward,9,1,1,1,5089,1063,6 +5832,convert_element_type_1514,call_function,convert_element_type.default,backward,9,1,1,1,5082,2,3 +5833,add_270,call_function,add.Tensor,unknown,,2,2,1,5090,1062,10 +5834,dtype_cast_422,call_function,dtype_cast.default,backward,9,1,1,1,5083,1,3 +5835,alias_default_1336,call_function,alias.default,backward,9,1,1,0,5084,0,2 +5836,alias_default_1094,call_function,alias.default,unknown,,1,1,3,5091,1061,4 +5837,einsum_default_457,call_function,einsum.default,backward,9,2,2,1,5092,3,5 +5838,permute_903,call_function,permute.default,backward,9,1,1,1,4,1057,3 +5839,einsum_default_458,call_function,einsum.default,backward,9,2,2,1,5093,1056,5 +5840,permute_904,call_function,permute.default,backward,9,1,1,1,5093,2,4 +5841,dtype_cast_423,call_function,dtype_cast.default,backward,9,1,1,1,5094,1,4 +5842,alias_default_1331,call_function,alias.default,backward,9,1,1,0,5095,0,3 +5843,view_1052,call_function,view.default,backward,9,1,1,1,5094,1055,4 +5844,permute_905,call_function,permute.default,backward,9,1,1,1,5095,1054,4 
+5845,_scaled_dot_product_flash_attention_backward_18,call_function,_scaled_dot_product_flash_attention_backward.default,backward,9,8,8,3,5099,1053,2 +5846,getitem_306,call_function,getitem,backward,9,1,1,1,5100,1026,2 +5847,getitem_307,call_function,getitem,backward,9,1,1,1,5100,1027,2 +5848,getitem_308,call_function,getitem,backward,9,1,1,1,5100,1020,2 +5849,permute_906,call_function,permute.default,backward,9,1,1,1,5101,1019,2 +5850,permute_907,call_function,permute.default,backward,9,1,1,1,5101,1026,2 +5851,permute_908,call_function,permute.default,backward,9,1,1,1,5101,1025,2 +5852,convert_element_type_1519,call_function,convert_element_type.default,backward,9,1,1,1,5102,1025,2 +5853,convert_element_type_1520,call_function,convert_element_type.default,backward,9,1,1,1,5102,1024,2 +5854,view_1053,call_function,view.default,backward,9,1,1,1,5103,1024,2 +5855,view_as_complex_92,call_function,view_as_complex.default,backward,9,1,1,1,5104,1023,6 +5856,_conj_36,call_function,_conj.default,backward,9,1,1,1,4,1024,3 +5857,clone_150,call_function,clone.default,backward,9,1,1,1,5,1023,3 +5858,mul_576,call_function,mul.Tensor,backward,9,2,2,1,5107,1022,8 +5859,view_1054,call_function,view.default,backward,9,1,1,1,5103,1023,2 +5860,view_as_complex_93,call_function,view_as_complex.default,backward,9,1,1,1,5104,1022,6 +5861,_conj_37,call_function,_conj.default,backward,9,1,1,1,4,1023,3 +5862,clone_151,call_function,clone.default,backward,9,1,1,1,5,1022,3 +5863,mul_577,call_function,mul.Tensor,backward,9,2,2,1,5107,1021,8 +5864,view_as_real_92,call_function,view_as_real.default,backward,9,1,1,1,5108,1021,6 +5865,view_1055,call_function,view.default,backward,9,1,1,1,5109,1020,6 +5866,convert_element_type_1521,call_function,convert_element_type.default,backward,9,1,1,1,5110,1019,6 +5867,view_as_real_93,call_function,view_as_real.default,backward,9,1,1,1,5108,1020,6 +5868,view_1056,call_function,view.default,backward,9,1,1,1,5109,1019,6 
+5869,convert_element_type_1522,call_function,convert_element_type.default,backward,9,1,1,1,5110,1018,6 +5870,view_1057,call_function,view.default,backward,9,1,1,1,5102,1018,2 +5871,view_1058,call_function,view.default,backward,9,1,1,1,5111,1018,5 +5872,view_1059,call_function,view.default,backward,9,1,1,1,5111,1017,5 +5873,alias_default_1095,call_function,alias.default,backward,9,1,1,2,5103,1017,4 +5874,einsum_default_459,call_function,einsum.default,backward,9,2,2,1,5104,3,5 +5875,permute_911,call_function,permute.default,backward,9,1,1,1,4,1013,3 +5876,einsum_default_460,call_function,einsum.default,backward,9,2,2,1,5105,1012,5 +5877,permute_912,call_function,permute.default,backward,9,1,1,1,5105,2,4 +5878,dtype_cast_424,call_function,dtype_cast.default,backward,9,1,1,1,5106,1,4 +5879,alias_default_1330,call_function,alias.default,backward,9,1,1,0,5107,0,3 +5880,alias_default_1096,call_function,alias.default,backward,9,1,1,2,5112,1017,4 +5881,einsum_default_461,call_function,einsum.default,backward,9,2,2,1,5113,3,5 +5882,permute_915,call_function,permute.default,backward,9,1,1,1,4,1013,3 +5883,einsum_default_462,call_function,einsum.default,backward,9,2,2,1,5114,1012,5 +5884,add_271,call_function,add.Tensor,unknown,,2,2,1,5121,1011,10 +5885,permute_916,call_function,permute.default,backward,9,1,1,1,5114,2,4 +5886,dtype_cast_425,call_function,dtype_cast.default,backward,9,1,1,1,5115,1,4 +5887,alias_default_1329,call_function,alias.default,backward,9,1,1,0,5116,0,3 +5888,alias_default_1097,call_function,alias.default,backward,9,1,1,2,5112,1016,4 +5889,einsum_default_463,call_function,einsum.default,backward,9,2,2,1,5113,3,5 +5890,permute_919,call_function,permute.default,backward,9,1,1,1,4,1012,3 +5891,einsum_default_464,call_function,einsum.default,backward,9,2,2,1,5114,1011,5 +5892,add_272,call_function,add.Tensor,unknown,,2,2,1,5137,1010,10 +5893,permute_920,call_function,permute.default,backward,9,1,1,1,5114,2,4 
+5894,dtype_cast_426,call_function,dtype_cast.default,backward,9,1,1,1,5115,1,4 +5895,alias_default_1328,call_function,alias.default,backward,9,1,1,0,5116,0,3 +5896,convert_element_type_1535,call_function,convert_element_type.default,backward,9,1,1,1,5138,1009,8 +5897,convert_element_type_1536,call_function,convert_element_type.default,backward,9,1,1,1,999,1009,4 +5898,convert_element_type_1537,call_function,convert_element_type.default,backward,9,1,1,1,3,1003,2 +5899,alias_default_1098,call_function,alias.default,backward,9,1,1,2,5139,1008,4 +5900,mul_578,call_function,mul.Tensor,backward,9,2,2,1,5141,1002,8 +5901,mul_579,call_function,mul.Tensor,backward,9,2,2,1,1007,1008,8 +5902,alias_default_1099,call_function,alias.default,backward,9,1,1,2,5142,1001,4 +5903,alias_default_1100,call_function,alias.default,backward,9,1,1,3,1008,1007,4 +5904,mul_580,call_function,mul.Tensor,backward,9,2,2,1,5146,1000,8 +5905,sum_77,call_function,sum.dim_IntList,backward,9,1,1,1,5147,999,5 +5906,div_66,call_function,div.Tensor,backward,9,1,1,1,1009,999,6 +5907,mul_581,call_function,mul.Tensor,backward,9,2,2,1,5149,998,8 +5908,sub_57,call_function,sub.Tensor,backward,9,2,2,1,5150,997,10 +5909,mul_582,call_function,mul.Tensor,backward,9,2,2,1,5151,996,8 +5910,mul_583,call_function,mul.Tensor,backward,9,2,2,1,5143,4,8 +5911,sum_78,call_function,sum.dim_IntList,backward,9,1,1,1,5144,3,5 +5912,convert_element_type_1538,call_function,convert_element_type.default,backward,9,1,1,1,5152,995,6 +5913,convert_element_type_1539,call_function,convert_element_type.default,backward,9,1,1,1,5145,2,3 +5914,add_273,call_function,add.Tensor,unknown,,2,2,1,5153,994,10 +5915,dtype_cast_427,call_function,dtype_cast.default,backward,9,1,1,1,5146,1,3 +5916,alias_default_1335,call_function,alias.default,backward,9,1,1,0,5147,0,2 +5917,alias_default_1101,call_function,alias.default,unknown,,1,1,3,5154,993,4 +5918,einsum_default_465,call_function,einsum.default,backward,8,2,2,1,5155,3,5 
+5919,permute_923,call_function,permute.default,backward,8,1,1,1,4,989,3 +5920,einsum_default_466,call_function,einsum.default,backward,8,2,2,1,5156,988,5 +5921,permute_924,call_function,permute.default,backward,8,1,1,1,5156,2,4 +5922,dtype_cast_428,call_function,dtype_cast.default,backward,8,1,1,1,5157,1,4 +5923,alias_default_1324,call_function,alias.default,backward,8,1,1,0,5158,0,3 +5924,alias_default_1102,call_function,alias.default,backward,8,1,1,2,5157,987,4 +5925,mul_584,call_function,mul.Tensor,backward,8,2,2,1,5158,975,8 +5926,mul_585,call_function,mul.Tensor,backward,8,2,2,1,5158,979,8 +5927,alias_default_1103,call_function,alias.default,backward,8,1,1,2,5159,974,4 +5928,einsum_default_467,call_function,einsum.default,backward,8,2,2,1,5160,3,5 +5929,permute_927,call_function,permute.default,backward,8,1,1,1,4,970,3 +5930,einsum_default_468,call_function,einsum.default,backward,8,2,2,1,5161,969,5 +5931,permute_928,call_function,permute.default,backward,8,1,1,1,5161,2,4 +5932,dtype_cast_429,call_function,dtype_cast.default,backward,8,1,1,1,5162,1,4 +5933,alias_default_1325,call_function,alias.default,backward,8,1,1,0,5163,0,3 +5934,convert_element_type_1548,call_function,convert_element_type.default,backward,8,1,1,1,5159,978,6 +5935,convert_element_type_1549,call_function,convert_element_type.default,backward,8,1,1,1,976,988,4 +5936,alias_default_1104,call_function,alias.default,backward,8,1,1,2,977,987,4 +5937,neg_47,call_function,neg.default,backward,8,1,1,1,978,986,8 +5938,exp_47,call_function,exp.default,backward,8,1,1,1,979,985,6 +5939,add_274,call_function,add.Tensor,backward,8,1,1,1,980,984,4 +5940,reciprocal_19,call_function,reciprocal.default,backward,8,1,1,1,981,983,4 +5941,mul_586,call_function,mul.Tensor,backward,8,1,1,1,982,982,6 +5942,alias_default_1105,call_function,alias.default,backward,8,1,1,2,983,981,4 +5943,mul_587,call_function,mul.Tensor,backward,8,2,2,1,5168,977,8 +5944,sub_58,call_function,sub.Tensor,backward,8,1,1,1,984,979,4 
+5945,mul_588,call_function,mul.Tensor,backward,8,2,2,1,985,978,8 +5946,add_275,call_function,add.Tensor,backward,8,1,1,1,986,977,4 +5947,mul_589,call_function,mul.Tensor,backward,8,2,2,1,5172,976,8 +5948,convert_element_type_1550,call_function,convert_element_type.default,backward,8,1,1,1,5173,975,6 +5949,alias_default_1106,call_function,alias.default,backward,8,1,1,2,5174,974,4 +5950,einsum_default_469,call_function,einsum.default,backward,8,2,2,1,5175,3,5 +5951,permute_931,call_function,permute.default,backward,8,1,1,1,4,970,3 +5952,einsum_default_470,call_function,einsum.default,backward,8,2,2,1,5176,969,5 +5953,add_276,call_function,add.Tensor,unknown,,2,2,1,5181,968,10 +5954,permute_932,call_function,permute.default,backward,8,1,1,1,5176,2,4 +5955,dtype_cast_430,call_function,dtype_cast.default,backward,8,1,1,1,5177,1,4 +5956,alias_default_1323,call_function,alias.default,backward,8,1,1,0,5178,0,3 +5957,convert_element_type_1555,call_function,convert_element_type.default,backward,8,1,1,1,5182,967,8 +5958,convert_element_type_1556,call_function,convert_element_type.default,backward,8,1,1,1,956,967,4 +5959,convert_element_type_1557,call_function,convert_element_type.default,backward,8,1,1,1,3,961,2 +5960,alias_default_1107,call_function,alias.default,backward,8,1,1,2,5183,966,4 +5961,mul_590,call_function,mul.Tensor,backward,8,2,2,1,5185,960,8 +5962,mul_591,call_function,mul.Tensor,backward,8,2,2,1,964,966,8 +5963,alias_default_1108,call_function,alias.default,backward,8,1,1,2,5186,959,4 +5964,alias_default_1109,call_function,alias.default,backward,8,1,1,3,965,965,4 +5965,mul_592,call_function,mul.Tensor,backward,8,2,2,1,5190,958,8 +5966,sum_79,call_function,sum.dim_IntList,backward,8,1,1,1,5191,957,5 +5967,div_67,call_function,div.Tensor,backward,8,1,1,1,966,957,6 +5968,mul_593,call_function,mul.Tensor,backward,8,2,2,1,5193,956,8 +5969,sub_59,call_function,sub.Tensor,backward,8,2,2,1,5194,955,10 
+5970,mul_594,call_function,mul.Tensor,backward,8,2,2,1,5195,954,8 +5971,mul_595,call_function,mul.Tensor,backward,8,2,2,1,5187,4,8 +5972,sum_80,call_function,sum.dim_IntList,backward,8,1,1,1,5188,3,5 +5973,convert_element_type_1558,call_function,convert_element_type.default,backward,8,1,1,1,5196,953,6 +5974,convert_element_type_1559,call_function,convert_element_type.default,backward,8,1,1,1,5189,2,3 +5975,add_277,call_function,add.Tensor,unknown,,2,2,1,5197,952,10 +5976,dtype_cast_431,call_function,dtype_cast.default,backward,8,1,1,1,5190,1,3 +5977,alias_default_1327,call_function,alias.default,backward,8,1,1,0,5191,0,2 +5978,alias_default_1110,call_function,alias.default,unknown,,1,1,3,5198,951,4 +5979,einsum_default_471,call_function,einsum.default,backward,8,2,2,1,5199,3,5 +5980,permute_935,call_function,permute.default,backward,8,1,1,1,4,947,3 +5981,einsum_default_472,call_function,einsum.default,backward,8,2,2,1,5200,946,5 +5982,permute_936,call_function,permute.default,backward,8,1,1,1,5200,2,4 +5983,dtype_cast_432,call_function,dtype_cast.default,backward,8,1,1,1,5201,1,4 +5984,alias_default_1322,call_function,alias.default,backward,8,1,1,0,5202,0,3 +5985,view_1074,call_function,view.default,backward,8,1,1,1,5201,945,4 +5986,permute_937,call_function,permute.default,backward,8,1,1,1,5202,944,4 +5987,_scaled_dot_product_flash_attention_backward_19,call_function,_scaled_dot_product_flash_attention_backward.default,backward,8,8,8,3,5206,943,2 +5988,getitem_309,call_function,getitem,backward,8,1,1,1,5207,916,2 +5989,getitem_310,call_function,getitem,backward,8,1,1,1,5207,917,2 +5990,getitem_311,call_function,getitem,backward,8,1,1,1,5207,910,2 +5991,permute_938,call_function,permute.default,backward,8,1,1,1,5208,909,2 +5992,permute_939,call_function,permute.default,backward,8,1,1,1,5208,916,2 +5993,permute_940,call_function,permute.default,backward,8,1,1,1,5208,915,2 
+5994,convert_element_type_1564,call_function,convert_element_type.default,backward,8,1,1,1,5209,915,2 +5995,convert_element_type_1565,call_function,convert_element_type.default,backward,8,1,1,1,5209,914,2 +5996,view_1075,call_function,view.default,backward,8,1,1,1,5210,914,2 +5997,view_as_complex_94,call_function,view_as_complex.default,backward,8,1,1,1,5211,913,6 +5998,_conj_38,call_function,_conj.default,backward,8,1,1,1,4,914,3 +5999,clone_158,call_function,clone.default,backward,8,1,1,1,5,913,3 +6000,mul_596,call_function,mul.Tensor,backward,8,2,2,1,5214,912,8 +6001,view_1076,call_function,view.default,backward,8,1,1,1,5210,913,2 +6002,view_as_complex_95,call_function,view_as_complex.default,backward,8,1,1,1,5211,912,6 +6003,_conj_39,call_function,_conj.default,backward,8,1,1,1,4,913,3 +6004,clone_159,call_function,clone.default,backward,8,1,1,1,5,912,3 +6005,mul_597,call_function,mul.Tensor,backward,8,2,2,1,5214,911,8 +6006,view_as_real_94,call_function,view_as_real.default,backward,8,1,1,1,5215,911,6 +6007,view_1077,call_function,view.default,backward,8,1,1,1,5216,910,6 +6008,convert_element_type_1566,call_function,convert_element_type.default,backward,8,1,1,1,5217,909,6 +6009,view_as_real_95,call_function,view_as_real.default,backward,8,1,1,1,5215,910,6 +6010,view_1078,call_function,view.default,backward,8,1,1,1,5216,909,6 +6011,convert_element_type_1567,call_function,convert_element_type.default,backward,8,1,1,1,5217,908,6 +6012,view_1079,call_function,view.default,backward,8,1,1,1,5209,908,2 +6013,view_1080,call_function,view.default,backward,8,1,1,1,5218,908,5 +6014,view_1081,call_function,view.default,backward,8,1,1,1,5218,907,5 +6015,alias_default_1111,call_function,alias.default,backward,8,1,1,2,5210,907,4 +6016,einsum_default_473,call_function,einsum.default,backward,8,2,2,1,5211,3,5 +6017,permute_943,call_function,permute.default,backward,8,1,1,1,4,903,3 +6018,einsum_default_474,call_function,einsum.default,backward,8,2,2,1,5212,902,5 
+6019,permute_944,call_function,permute.default,backward,8,1,1,1,5212,2,4 +6020,dtype_cast_433,call_function,dtype_cast.default,backward,8,1,1,1,5213,1,4 +6021,alias_default_1321,call_function,alias.default,backward,8,1,1,0,5214,0,3 +6022,alias_default_1112,call_function,alias.default,backward,8,1,1,2,5219,907,4 +6023,einsum_default_475,call_function,einsum.default,backward,8,2,2,1,5220,3,5 +6024,permute_947,call_function,permute.default,backward,8,1,1,1,4,903,3 +6025,einsum_default_476,call_function,einsum.default,backward,8,2,2,1,5221,902,5 +6026,add_278,call_function,add.Tensor,unknown,,2,2,1,5228,901,10 +6027,permute_948,call_function,permute.default,backward,8,1,1,1,5221,2,4 +6028,dtype_cast_434,call_function,dtype_cast.default,backward,8,1,1,1,5222,1,4 +6029,alias_default_1320,call_function,alias.default,backward,8,1,1,0,5223,0,3 +6030,alias_default_1113,call_function,alias.default,backward,8,1,1,2,5219,906,4 +6031,einsum_default_477,call_function,einsum.default,backward,8,2,2,1,5220,3,5 +6032,permute_951,call_function,permute.default,backward,8,1,1,1,4,902,3 +6033,einsum_default_478,call_function,einsum.default,backward,8,2,2,1,5221,901,5 +6034,add_279,call_function,add.Tensor,unknown,,2,2,1,5244,900,10 +6035,permute_952,call_function,permute.default,backward,8,1,1,1,5221,2,4 +6036,dtype_cast_435,call_function,dtype_cast.default,backward,8,1,1,1,5222,1,4 +6037,alias_default_1319,call_function,alias.default,backward,8,1,1,0,5223,0,3 +6038,convert_element_type_1580,call_function,convert_element_type.default,backward,8,1,1,1,5245,899,8 +6039,convert_element_type_1581,call_function,convert_element_type.default,backward,8,1,1,1,889,899,4 +6040,convert_element_type_1582,call_function,convert_element_type.default,backward,8,1,1,1,3,893,2 +6041,alias_default_1114,call_function,alias.default,backward,8,1,1,2,5246,898,4 +6042,mul_598,call_function,mul.Tensor,backward,8,2,2,1,5248,892,8 +6043,mul_599,call_function,mul.Tensor,backward,8,2,2,1,897,898,8 
+6044,alias_default_1115,call_function,alias.default,backward,8,1,1,2,5249,891,4 +6045,alias_default_1116,call_function,alias.default,backward,8,1,1,3,898,897,4 +6046,mul_600,call_function,mul.Tensor,backward,8,2,2,1,5253,890,8 +6047,sum_81,call_function,sum.dim_IntList,backward,8,1,1,1,5254,889,5 +6048,div_68,call_function,div.Tensor,backward,8,1,1,1,899,889,6 +6049,mul_601,call_function,mul.Tensor,backward,8,2,2,1,5256,888,8 +6050,sub_60,call_function,sub.Tensor,backward,8,2,2,1,5257,887,10 +6051,mul_602,call_function,mul.Tensor,backward,8,2,2,1,5258,886,8 +6052,mul_603,call_function,mul.Tensor,backward,8,2,2,1,5250,4,8 +6053,sum_82,call_function,sum.dim_IntList,backward,8,1,1,1,5251,3,5 +6054,convert_element_type_1583,call_function,convert_element_type.default,backward,8,1,1,1,5259,885,6 +6055,convert_element_type_1584,call_function,convert_element_type.default,backward,8,1,1,1,5252,2,3 +6056,add_280,call_function,add.Tensor,unknown,,2,2,1,5260,884,10 +6057,dtype_cast_436,call_function,dtype_cast.default,backward,8,1,1,1,5253,1,3 +6058,alias_default_1326,call_function,alias.default,backward,8,1,1,0,5254,0,2 +6059,alias_default_1117,call_function,alias.default,unknown,,1,1,3,5261,883,4 +6060,einsum_default_479,call_function,einsum.default,backward,7,2,2,1,5262,3,5 +6061,permute_955,call_function,permute.default,backward,7,1,1,1,4,879,3 +6062,einsum_default_480,call_function,einsum.default,backward,7,2,2,1,5263,878,5 +6063,permute_956,call_function,permute.default,backward,7,1,1,1,5263,2,4 +6064,dtype_cast_437,call_function,dtype_cast.default,backward,7,1,1,1,5264,1,4 +6065,alias_default_1315,call_function,alias.default,backward,7,1,1,0,5265,0,3 +6066,alias_default_1118,call_function,alias.default,backward,7,1,1,2,5264,877,4 +6067,mul_604,call_function,mul.Tensor,backward,7,2,2,1,5265,865,8 +6068,mul_605,call_function,mul.Tensor,backward,7,2,2,1,5265,869,8 +6069,alias_default_1119,call_function,alias.default,backward,7,1,1,2,5266,864,4 
+6070,einsum_default_481,call_function,einsum.default,backward,7,2,2,1,5267,3,5 +6071,permute_959,call_function,permute.default,backward,7,1,1,1,4,860,3 +6072,einsum_default_482,call_function,einsum.default,backward,7,2,2,1,5268,859,5 +6073,permute_960,call_function,permute.default,backward,7,1,1,1,5268,2,4 +6074,dtype_cast_438,call_function,dtype_cast.default,backward,7,1,1,1,5269,1,4 +6075,alias_default_1316,call_function,alias.default,backward,7,1,1,0,5270,0,3 +6076,convert_element_type_1593,call_function,convert_element_type.default,backward,7,1,1,1,5266,868,6 +6077,convert_element_type_1594,call_function,convert_element_type.default,backward,7,1,1,1,866,878,4 +6078,alias_default_1120,call_function,alias.default,backward,7,1,1,2,867,877,4 +6079,neg_48,call_function,neg.default,backward,7,1,1,1,868,876,8 +6080,exp_48,call_function,exp.default,backward,7,1,1,1,869,875,6 +6081,add_281,call_function,add.Tensor,backward,7,1,1,1,870,874,4 +6082,reciprocal_20,call_function,reciprocal.default,backward,7,1,1,1,871,873,4 +6083,mul_606,call_function,mul.Tensor,backward,7,1,1,1,872,872,6 +6084,alias_default_1121,call_function,alias.default,backward,7,1,1,2,873,871,4 +6085,mul_607,call_function,mul.Tensor,backward,7,2,2,1,5275,867,8 +6086,sub_61,call_function,sub.Tensor,backward,7,1,1,1,874,869,4 +6087,mul_608,call_function,mul.Tensor,backward,7,2,2,1,875,868,8 +6088,add_282,call_function,add.Tensor,backward,7,1,1,1,876,867,4 +6089,mul_609,call_function,mul.Tensor,backward,7,2,2,1,5279,866,8 +6090,convert_element_type_1595,call_function,convert_element_type.default,backward,7,1,1,1,5280,865,6 +6091,alias_default_1122,call_function,alias.default,backward,7,1,1,2,5281,864,4 +6092,einsum_default_483,call_function,einsum.default,backward,7,2,2,1,5282,3,5 +6093,permute_963,call_function,permute.default,backward,7,1,1,1,4,860,3 +6094,einsum_default_484,call_function,einsum.default,backward,7,2,2,1,5283,859,5 +6095,add_283,call_function,add.Tensor,unknown,,2,2,1,5288,858,10 
+6096,permute_964,call_function,permute.default,backward,7,1,1,1,5283,2,4 +6097,dtype_cast_439,call_function,dtype_cast.default,backward,7,1,1,1,5284,1,4 +6098,alias_default_1314,call_function,alias.default,backward,7,1,1,0,5285,0,3 +6099,convert_element_type_1600,call_function,convert_element_type.default,backward,7,1,1,1,5289,857,8 +6100,convert_element_type_1601,call_function,convert_element_type.default,backward,7,1,1,1,846,857,4 +6101,convert_element_type_1602,call_function,convert_element_type.default,backward,7,1,1,1,3,851,2 +6102,alias_default_1123,call_function,alias.default,backward,7,1,1,2,5290,856,4 +6103,mul_610,call_function,mul.Tensor,backward,7,2,2,1,5292,850,8 +6104,mul_611,call_function,mul.Tensor,backward,7,2,2,1,854,856,8 +6105,alias_default_1124,call_function,alias.default,backward,7,1,1,2,5293,849,4 +6106,alias_default_1125,call_function,alias.default,backward,7,1,1,3,855,855,4 +6107,mul_612,call_function,mul.Tensor,backward,7,2,2,1,5297,848,8 +6108,sum_83,call_function,sum.dim_IntList,backward,7,1,1,1,5298,847,5 +6109,div_69,call_function,div.Tensor,backward,7,1,1,1,856,847,6 +6110,mul_613,call_function,mul.Tensor,backward,7,2,2,1,5300,846,8 +6111,sub_62,call_function,sub.Tensor,backward,7,2,2,1,5301,845,10 +6112,mul_614,call_function,mul.Tensor,backward,7,2,2,1,5302,844,8 +6113,mul_615,call_function,mul.Tensor,backward,7,2,2,1,5294,4,8 +6114,sum_84,call_function,sum.dim_IntList,backward,7,1,1,1,5295,3,5 +6115,convert_element_type_1603,call_function,convert_element_type.default,backward,7,1,1,1,5303,843,6 +6116,convert_element_type_1604,call_function,convert_element_type.default,backward,7,1,1,1,5296,2,3 +6117,add_284,call_function,add.Tensor,unknown,,2,2,1,5304,842,10 +6118,dtype_cast_440,call_function,dtype_cast.default,backward,7,1,1,1,5297,1,3 +6119,alias_default_1318,call_function,alias.default,backward,7,1,1,0,5298,0,2 +6120,alias_default_1126,call_function,alias.default,unknown,,1,1,3,5305,841,4 
+6121,einsum_default_485,call_function,einsum.default,backward,7,2,2,1,5306,3,5 +6122,permute_967,call_function,permute.default,backward,7,1,1,1,4,837,3 +6123,einsum_default_486,call_function,einsum.default,backward,7,2,2,1,5307,836,5 +6124,permute_968,call_function,permute.default,backward,7,1,1,1,5307,2,4 +6125,dtype_cast_441,call_function,dtype_cast.default,backward,7,1,1,1,5308,1,4 +6126,alias_default_1313,call_function,alias.default,backward,7,1,1,0,5309,0,3 +6127,view_1096,call_function,view.default,backward,7,1,1,1,5308,835,4 +6128,permute_969,call_function,permute.default,backward,7,1,1,1,5309,834,4 +6129,_scaled_dot_product_flash_attention_backward_20,call_function,_scaled_dot_product_flash_attention_backward.default,backward,7,8,8,3,5313,833,2 +6130,getitem_312,call_function,getitem,backward,7,1,1,1,5314,806,2 +6131,getitem_313,call_function,getitem,backward,7,1,1,1,5314,807,2 +6132,getitem_314,call_function,getitem,backward,7,1,1,1,5314,800,2 +6133,permute_970,call_function,permute.default,backward,7,1,1,1,5315,799,2 +6134,permute_971,call_function,permute.default,backward,7,1,1,1,5315,806,2 +6135,permute_972,call_function,permute.default,backward,7,1,1,1,5315,805,2 +6136,convert_element_type_1609,call_function,convert_element_type.default,backward,7,1,1,1,5316,805,2 +6137,convert_element_type_1610,call_function,convert_element_type.default,backward,7,1,1,1,5316,804,2 +6138,view_1097,call_function,view.default,backward,7,1,1,1,5317,804,2 +6139,view_as_complex_96,call_function,view_as_complex.default,backward,7,1,1,1,5318,803,6 +6140,_conj_40,call_function,_conj.default,backward,7,1,1,1,4,804,3 +6141,clone_166,call_function,clone.default,backward,7,1,1,1,5,803,3 +6142,mul_616,call_function,mul.Tensor,backward,7,2,2,1,5321,802,8 +6143,view_1098,call_function,view.default,backward,7,1,1,1,5317,803,2 +6144,view_as_complex_97,call_function,view_as_complex.default,backward,7,1,1,1,5318,802,6 +6145,_conj_41,call_function,_conj.default,backward,7,1,1,1,4,803,3 
+6146,clone_167,call_function,clone.default,backward,7,1,1,1,5,802,3 +6147,mul_617,call_function,mul.Tensor,backward,7,2,2,1,5321,801,8 +6148,view_as_real_96,call_function,view_as_real.default,backward,7,1,1,1,5322,801,6 +6149,view_1099,call_function,view.default,backward,7,1,1,1,5323,800,6 +6150,convert_element_type_1611,call_function,convert_element_type.default,backward,7,1,1,1,5324,799,6 +6151,view_as_real_97,call_function,view_as_real.default,backward,7,1,1,1,5322,800,6 +6152,view_1100,call_function,view.default,backward,7,1,1,1,5323,799,6 +6153,convert_element_type_1612,call_function,convert_element_type.default,backward,7,1,1,1,5324,798,6 +6154,view_1101,call_function,view.default,backward,7,1,1,1,5316,798,2 +6155,view_1102,call_function,view.default,backward,7,1,1,1,5325,798,5 +6156,view_1103,call_function,view.default,backward,7,1,1,1,5325,797,5 +6157,alias_default_1127,call_function,alias.default,backward,7,1,1,2,5317,797,4 +6158,einsum_default_487,call_function,einsum.default,backward,7,2,2,1,5318,3,5 +6159,permute_975,call_function,permute.default,backward,7,1,1,1,4,793,3 +6160,einsum_default_488,call_function,einsum.default,backward,7,2,2,1,5319,792,5 +6161,permute_976,call_function,permute.default,backward,7,1,1,1,5319,2,4 +6162,dtype_cast_442,call_function,dtype_cast.default,backward,7,1,1,1,5320,1,4 +6163,alias_default_1312,call_function,alias.default,backward,7,1,1,0,5321,0,3 +6164,alias_default_1128,call_function,alias.default,backward,7,1,1,2,5326,797,4 +6165,einsum_default_489,call_function,einsum.default,backward,7,2,2,1,5327,3,5 +6166,permute_979,call_function,permute.default,backward,7,1,1,1,4,793,3 +6167,einsum_default_490,call_function,einsum.default,backward,7,2,2,1,5328,792,5 +6168,add_285,call_function,add.Tensor,unknown,,2,2,1,5335,791,10 +6169,permute_980,call_function,permute.default,backward,7,1,1,1,5328,2,4 +6170,dtype_cast_443,call_function,dtype_cast.default,backward,7,1,1,1,5329,1,4 
+6171,alias_default_1311,call_function,alias.default,backward,7,1,1,0,5330,0,3 +6172,alias_default_1129,call_function,alias.default,backward,7,1,1,2,5326,796,4 +6173,einsum_default_491,call_function,einsum.default,backward,7,2,2,1,5327,3,5 +6174,permute_983,call_function,permute.default,backward,7,1,1,1,4,792,3 +6175,einsum_default_492,call_function,einsum.default,backward,7,2,2,1,5328,791,5 +6176,add_286,call_function,add.Tensor,unknown,,2,2,1,5351,790,10 +6177,permute_984,call_function,permute.default,backward,7,1,1,1,5328,2,4 +6178,dtype_cast_444,call_function,dtype_cast.default,backward,7,1,1,1,5329,1,4 +6179,alias_default_1310,call_function,alias.default,backward,7,1,1,0,5330,0,3 +6180,convert_element_type_1625,call_function,convert_element_type.default,backward,7,1,1,1,5352,789,8 +6181,convert_element_type_1626,call_function,convert_element_type.default,backward,7,1,1,1,779,789,4 +6182,convert_element_type_1627,call_function,convert_element_type.default,backward,7,1,1,1,3,783,2 +6183,alias_default_1130,call_function,alias.default,backward,7,1,1,2,5353,788,4 +6184,mul_618,call_function,mul.Tensor,backward,7,2,2,1,5355,782,8 +6185,mul_619,call_function,mul.Tensor,backward,7,2,2,1,787,788,8 +6186,alias_default_1131,call_function,alias.default,backward,7,1,1,2,5356,781,4 +6187,alias_default_1132,call_function,alias.default,backward,7,1,1,3,788,787,4 +6188,mul_620,call_function,mul.Tensor,backward,7,2,2,1,5360,780,8 +6189,sum_85,call_function,sum.dim_IntList,backward,7,1,1,1,5361,779,5 +6190,div_70,call_function,div.Tensor,backward,7,1,1,1,789,779,6 +6191,mul_621,call_function,mul.Tensor,backward,7,2,2,1,5363,778,8 +6192,sub_63,call_function,sub.Tensor,backward,7,2,2,1,5364,777,10 +6193,mul_622,call_function,mul.Tensor,backward,7,2,2,1,5365,776,8 +6194,mul_623,call_function,mul.Tensor,backward,7,2,2,1,5357,4,8 +6195,sum_86,call_function,sum.dim_IntList,backward,7,1,1,1,5358,3,5 
+6196,convert_element_type_1628,call_function,convert_element_type.default,backward,7,1,1,1,5366,775,6 +6197,convert_element_type_1629,call_function,convert_element_type.default,backward,7,1,1,1,5359,2,3 +6198,add_287,call_function,add.Tensor,unknown,,2,2,1,5367,774,10 +6199,dtype_cast_445,call_function,dtype_cast.default,backward,7,1,1,1,5360,1,3 +6200,alias_default_1317,call_function,alias.default,backward,7,1,1,0,5361,0,2 +6201,alias_default_1133,call_function,alias.default,unknown,,1,1,3,5368,773,4 +6202,einsum_default_493,call_function,einsum.default,backward,6,2,2,1,5369,3,5 +6203,permute_987,call_function,permute.default,backward,6,1,1,1,4,769,3 +6204,einsum_default_494,call_function,einsum.default,backward,6,2,2,1,5370,768,5 +6205,permute_988,call_function,permute.default,backward,6,1,1,1,5370,2,4 +6206,dtype_cast_446,call_function,dtype_cast.default,backward,6,1,1,1,5371,1,4 +6207,alias_default_1306,call_function,alias.default,backward,6,1,1,0,5372,0,3 +6208,alias_default_1134,call_function,alias.default,backward,6,1,1,2,5371,767,4 +6209,mul_624,call_function,mul.Tensor,backward,6,2,2,1,5372,755,8 +6210,mul_625,call_function,mul.Tensor,backward,6,2,2,1,5372,759,8 +6211,alias_default_1135,call_function,alias.default,backward,6,1,1,2,5373,754,4 +6212,einsum_default_495,call_function,einsum.default,backward,6,2,2,1,5374,3,5 +6213,permute_991,call_function,permute.default,backward,6,1,1,1,4,750,3 +6214,einsum_default_496,call_function,einsum.default,backward,6,2,2,1,5375,749,5 +6215,permute_992,call_function,permute.default,backward,6,1,1,1,5375,2,4 +6216,dtype_cast_447,call_function,dtype_cast.default,backward,6,1,1,1,5376,1,4 +6217,alias_default_1307,call_function,alias.default,backward,6,1,1,0,5377,0,3 +6218,convert_element_type_1638,call_function,convert_element_type.default,backward,6,1,1,1,5373,758,6 +6219,convert_element_type_1639,call_function,convert_element_type.default,backward,6,1,1,1,756,768,4 
+6220,alias_default_1136,call_function,alias.default,backward,6,1,1,2,757,767,4 +6221,neg_49,call_function,neg.default,backward,6,1,1,1,758,766,8 +6222,exp_49,call_function,exp.default,backward,6,1,1,1,759,765,6 +6223,add_288,call_function,add.Tensor,backward,6,1,1,1,760,764,4 +6224,reciprocal_21,call_function,reciprocal.default,backward,6,1,1,1,761,763,4 +6225,mul_626,call_function,mul.Tensor,backward,6,1,1,1,762,762,6 +6226,alias_default_1137,call_function,alias.default,backward,6,1,1,2,763,761,4 +6227,mul_627,call_function,mul.Tensor,backward,6,2,2,1,5382,757,8 +6228,sub_64,call_function,sub.Tensor,backward,6,1,1,1,764,759,4 +6229,mul_628,call_function,mul.Tensor,backward,6,2,2,1,765,758,8 +6230,add_289,call_function,add.Tensor,backward,6,1,1,1,766,757,4 +6231,mul_629,call_function,mul.Tensor,backward,6,2,2,1,5386,756,8 +6232,convert_element_type_1640,call_function,convert_element_type.default,backward,6,1,1,1,5387,755,6 +6233,alias_default_1138,call_function,alias.default,backward,6,1,1,2,5388,754,4 +6234,einsum_default_497,call_function,einsum.default,backward,6,2,2,1,5389,3,5 +6235,permute_995,call_function,permute.default,backward,6,1,1,1,4,750,3 +6236,einsum_default_498,call_function,einsum.default,backward,6,2,2,1,5390,749,5 +6237,add_290,call_function,add.Tensor,unknown,,2,2,1,5395,748,10 +6238,permute_996,call_function,permute.default,backward,6,1,1,1,5390,2,4 +6239,dtype_cast_448,call_function,dtype_cast.default,backward,6,1,1,1,5391,1,4 +6240,alias_default_1305,call_function,alias.default,backward,6,1,1,0,5392,0,3 +6241,convert_element_type_1645,call_function,convert_element_type.default,backward,6,1,1,1,5396,747,8 +6242,convert_element_type_1646,call_function,convert_element_type.default,backward,6,1,1,1,736,747,4 +6243,convert_element_type_1647,call_function,convert_element_type.default,backward,6,1,1,1,3,741,2 +6244,alias_default_1139,call_function,alias.default,backward,6,1,1,2,5397,746,4 
+6245,mul_630,call_function,mul.Tensor,backward,6,2,2,1,5399,740,8 +6246,mul_631,call_function,mul.Tensor,backward,6,2,2,1,744,746,8 +6247,alias_default_1140,call_function,alias.default,backward,6,1,1,2,5400,739,4 +6248,alias_default_1141,call_function,alias.default,backward,6,1,1,3,745,745,4 +6249,mul_632,call_function,mul.Tensor,backward,6,2,2,1,5404,738,8 +6250,sum_87,call_function,sum.dim_IntList,backward,6,1,1,1,5405,737,5 +6251,div_71,call_function,div.Tensor,backward,6,1,1,1,746,737,6 +6252,mul_633,call_function,mul.Tensor,backward,6,2,2,1,5407,736,8 +6253,sub_65,call_function,sub.Tensor,backward,6,2,2,1,5408,735,10 +6254,mul_634,call_function,mul.Tensor,backward,6,2,2,1,5409,734,8 +6255,mul_635,call_function,mul.Tensor,backward,6,2,2,1,5401,4,8 +6256,sum_88,call_function,sum.dim_IntList,backward,6,1,1,1,5402,3,5 +6257,convert_element_type_1648,call_function,convert_element_type.default,backward,6,1,1,1,5410,733,6 +6258,convert_element_type_1649,call_function,convert_element_type.default,backward,6,1,1,1,5403,2,3 +6259,add_291,call_function,add.Tensor,unknown,,2,2,1,5411,732,10 +6260,dtype_cast_449,call_function,dtype_cast.default,backward,6,1,1,1,5404,1,3 +6261,alias_default_1309,call_function,alias.default,backward,6,1,1,0,5405,0,2 +6262,alias_default_1142,call_function,alias.default,unknown,,1,1,3,5412,731,4 +6263,einsum_default_499,call_function,einsum.default,backward,6,2,2,1,5413,3,5 +6264,permute_999,call_function,permute.default,backward,6,1,1,1,4,727,3 +6265,einsum_default_500,call_function,einsum.default,backward,6,2,2,1,5414,726,5 +6266,permute_1000,call_function,permute.default,backward,6,1,1,1,5414,2,4 +6267,dtype_cast_450,call_function,dtype_cast.default,backward,6,1,1,1,5415,1,4 +6268,alias_default_1304,call_function,alias.default,backward,6,1,1,0,5416,0,3 +6269,view_1118,call_function,view.default,backward,6,1,1,1,5415,725,4 +6270,permute_1001,call_function,permute.default,backward,6,1,1,1,5416,724,4 
+6271,_scaled_dot_product_flash_attention_backward_21,call_function,_scaled_dot_product_flash_attention_backward.default,backward,6,8,8,3,5420,723,2 +6272,getitem_315,call_function,getitem,backward,6,1,1,1,5421,696,2 +6273,getitem_316,call_function,getitem,backward,6,1,1,1,5421,697,2 +6274,getitem_317,call_function,getitem,backward,6,1,1,1,5421,690,2 +6275,permute_1002,call_function,permute.default,backward,6,1,1,1,5422,689,2 +6276,permute_1003,call_function,permute.default,backward,6,1,1,1,5422,696,2 +6277,permute_1004,call_function,permute.default,backward,6,1,1,1,5422,695,2 +6278,convert_element_type_1654,call_function,convert_element_type.default,backward,6,1,1,1,5423,695,2 +6279,convert_element_type_1655,call_function,convert_element_type.default,backward,6,1,1,1,5423,694,2 +6280,view_1119,call_function,view.default,backward,6,1,1,1,5424,694,2 +6281,view_as_complex_98,call_function,view_as_complex.default,backward,6,1,1,1,5425,693,6 +6282,_conj_42,call_function,_conj.default,backward,6,1,1,1,4,694,3 +6283,clone_174,call_function,clone.default,backward,6,1,1,1,5,693,3 +6284,mul_636,call_function,mul.Tensor,backward,6,2,2,1,5428,692,8 +6285,view_1120,call_function,view.default,backward,6,1,1,1,5424,693,2 +6286,view_as_complex_99,call_function,view_as_complex.default,backward,6,1,1,1,5425,692,6 +6287,_conj_43,call_function,_conj.default,backward,6,1,1,1,4,693,3 +6288,clone_175,call_function,clone.default,backward,6,1,1,1,5,692,3 +6289,mul_637,call_function,mul.Tensor,backward,6,2,2,1,5428,691,8 +6290,view_as_real_98,call_function,view_as_real.default,backward,6,1,1,1,5429,691,6 +6291,view_1121,call_function,view.default,backward,6,1,1,1,5430,690,6 +6292,convert_element_type_1656,call_function,convert_element_type.default,backward,6,1,1,1,5431,689,6 +6293,view_as_real_99,call_function,view_as_real.default,backward,6,1,1,1,5429,690,6 +6294,view_1122,call_function,view.default,backward,6,1,1,1,5430,689,6 
+6295,convert_element_type_1657,call_function,convert_element_type.default,backward,6,1,1,1,5431,688,6 +6296,view_1123,call_function,view.default,backward,6,1,1,1,5423,688,2 +6297,view_1124,call_function,view.default,backward,6,1,1,1,5432,688,5 +6298,view_1125,call_function,view.default,backward,6,1,1,1,5432,687,5 +6299,alias_default_1143,call_function,alias.default,backward,6,1,1,2,5424,687,4 +6300,einsum_default_501,call_function,einsum.default,backward,6,2,2,1,5425,3,5 +6301,permute_1007,call_function,permute.default,backward,6,1,1,1,4,683,3 +6302,einsum_default_502,call_function,einsum.default,backward,6,2,2,1,5426,682,5 +6303,permute_1008,call_function,permute.default,backward,6,1,1,1,5426,2,4 +6304,dtype_cast_451,call_function,dtype_cast.default,backward,6,1,1,1,5427,1,4 +6305,alias_default_1303,call_function,alias.default,backward,6,1,1,0,5428,0,3 +6306,alias_default_1144,call_function,alias.default,backward,6,1,1,2,5433,687,4 +6307,einsum_default_503,call_function,einsum.default,backward,6,2,2,1,5434,3,5 +6308,permute_1011,call_function,permute.default,backward,6,1,1,1,4,683,3 +6309,einsum_default_504,call_function,einsum.default,backward,6,2,2,1,5435,682,5 +6310,add_292,call_function,add.Tensor,unknown,,2,2,1,5442,681,10 +6311,permute_1012,call_function,permute.default,backward,6,1,1,1,5435,2,4 +6312,dtype_cast_452,call_function,dtype_cast.default,backward,6,1,1,1,5436,1,4 +6313,alias_default_1302,call_function,alias.default,backward,6,1,1,0,5437,0,3 +6314,alias_default_1145,call_function,alias.default,backward,6,1,1,2,5433,686,4 +6315,einsum_default_505,call_function,einsum.default,backward,6,2,2,1,5434,3,5 +6316,permute_1015,call_function,permute.default,backward,6,1,1,1,4,682,3 +6317,einsum_default_506,call_function,einsum.default,backward,6,2,2,1,5435,681,5 +6318,add_293,call_function,add.Tensor,unknown,,2,2,1,5458,680,10 +6319,permute_1016,call_function,permute.default,backward,6,1,1,1,5435,2,4 
+6320,dtype_cast_453,call_function,dtype_cast.default,backward,6,1,1,1,5436,1,4 +6321,alias_default_1301,call_function,alias.default,backward,6,1,1,0,5437,0,3 +6322,convert_element_type_1670,call_function,convert_element_type.default,backward,6,1,1,1,5459,679,8 +6323,convert_element_type_1671,call_function,convert_element_type.default,backward,6,1,1,1,669,679,4 +6324,convert_element_type_1672,call_function,convert_element_type.default,backward,6,1,1,1,3,673,2 +6325,alias_default_1146,call_function,alias.default,backward,6,1,1,2,5460,678,4 +6326,mul_638,call_function,mul.Tensor,backward,6,2,2,1,5462,672,8 +6327,mul_639,call_function,mul.Tensor,backward,6,2,2,1,677,678,8 +6328,alias_default_1147,call_function,alias.default,backward,6,1,1,2,5463,671,4 +6329,alias_default_1148,call_function,alias.default,backward,6,1,1,3,678,677,4 +6330,mul_640,call_function,mul.Tensor,backward,6,2,2,1,5467,670,8 +6331,sum_89,call_function,sum.dim_IntList,backward,6,1,1,1,5468,669,5 +6332,div_72,call_function,div.Tensor,backward,6,1,1,1,679,669,6 +6333,mul_641,call_function,mul.Tensor,backward,6,2,2,1,5470,668,8 +6334,sub_66,call_function,sub.Tensor,backward,6,2,2,1,5471,667,10 +6335,mul_642,call_function,mul.Tensor,backward,6,2,2,1,5472,666,8 +6336,mul_643,call_function,mul.Tensor,backward,6,2,2,1,5464,4,8 +6337,sum_90,call_function,sum.dim_IntList,backward,6,1,1,1,5465,3,5 +6338,convert_element_type_1673,call_function,convert_element_type.default,backward,6,1,1,1,5473,665,6 +6339,convert_element_type_1674,call_function,convert_element_type.default,backward,6,1,1,1,5466,2,3 +6340,add_294,call_function,add.Tensor,unknown,,2,2,1,5474,664,10 +6341,dtype_cast_454,call_function,dtype_cast.default,backward,6,1,1,1,5467,1,3 +6342,alias_default_1308,call_function,alias.default,backward,6,1,1,0,5468,0,2 +6343,alias_default_1149,call_function,alias.default,unknown,,1,1,3,5475,663,4 +6344,einsum_default_507,call_function,einsum.default,backward,5,2,2,1,5476,3,5 
+6345,permute_1019,call_function,permute.default,backward,5,1,1,1,4,659,3 +6346,einsum_default_508,call_function,einsum.default,backward,5,2,2,1,5477,658,5 +6347,permute_1020,call_function,permute.default,backward,5,1,1,1,5477,2,4 +6348,dtype_cast_455,call_function,dtype_cast.default,backward,5,1,1,1,5478,1,4 +6349,alias_default_1297,call_function,alias.default,backward,5,1,1,0,5479,0,3 +6350,alias_default_1150,call_function,alias.default,backward,5,1,1,2,5478,657,4 +6351,mul_644,call_function,mul.Tensor,backward,5,2,2,1,5479,645,8 +6352,mul_645,call_function,mul.Tensor,backward,5,2,2,1,5479,649,8 +6353,alias_default_1151,call_function,alias.default,backward,5,1,1,2,5480,644,4 +6354,einsum_default_509,call_function,einsum.default,backward,5,2,2,1,5481,3,5 +6355,permute_1023,call_function,permute.default,backward,5,1,1,1,4,640,3 +6356,einsum_default_510,call_function,einsum.default,backward,5,2,2,1,5482,639,5 +6357,permute_1024,call_function,permute.default,backward,5,1,1,1,5482,2,4 +6358,dtype_cast_456,call_function,dtype_cast.default,backward,5,1,1,1,5483,1,4 +6359,alias_default_1298,call_function,alias.default,backward,5,1,1,0,5484,0,3 +6360,convert_element_type_1683,call_function,convert_element_type.default,backward,5,1,1,1,5480,648,6 +6361,convert_element_type_1684,call_function,convert_element_type.default,backward,5,1,1,1,646,658,4 +6362,alias_default_1152,call_function,alias.default,backward,5,1,1,2,647,657,4 +6363,neg_50,call_function,neg.default,backward,5,1,1,1,648,656,8 +6364,exp_50,call_function,exp.default,backward,5,1,1,1,649,655,6 +6365,add_295,call_function,add.Tensor,backward,5,1,1,1,650,654,4 +6366,reciprocal_22,call_function,reciprocal.default,backward,5,1,1,1,651,653,4 +6367,mul_646,call_function,mul.Tensor,backward,5,1,1,1,652,652,6 +6368,alias_default_1153,call_function,alias.default,backward,5,1,1,2,653,651,4 +6369,mul_647,call_function,mul.Tensor,backward,5,2,2,1,5489,647,8 +6370,sub_67,call_function,sub.Tensor,backward,5,1,1,1,654,649,4 
+6371,mul_648,call_function,mul.Tensor,backward,5,2,2,1,655,648,8 +6372,add_296,call_function,add.Tensor,backward,5,1,1,1,656,647,4 +6373,mul_649,call_function,mul.Tensor,backward,5,2,2,1,5493,646,8 +6374,convert_element_type_1685,call_function,convert_element_type.default,backward,5,1,1,1,5494,645,6 +6375,alias_default_1154,call_function,alias.default,backward,5,1,1,2,5495,644,4 +6376,einsum_default_511,call_function,einsum.default,backward,5,2,2,1,5496,3,5 +6377,permute_1027,call_function,permute.default,backward,5,1,1,1,4,640,3 +6378,einsum_default_512,call_function,einsum.default,backward,5,2,2,1,5497,639,5 +6379,add_297,call_function,add.Tensor,unknown,,2,2,1,5502,638,10 +6380,permute_1028,call_function,permute.default,backward,5,1,1,1,5497,2,4 +6381,dtype_cast_457,call_function,dtype_cast.default,backward,5,1,1,1,5498,1,4 +6382,alias_default_1296,call_function,alias.default,backward,5,1,1,0,5499,0,3 +6383,convert_element_type_1690,call_function,convert_element_type.default,backward,5,1,1,1,5503,637,8 +6384,convert_element_type_1691,call_function,convert_element_type.default,backward,5,1,1,1,626,637,4 +6385,convert_element_type_1692,call_function,convert_element_type.default,backward,5,1,1,1,3,631,2 +6386,alias_default_1155,call_function,alias.default,backward,5,1,1,2,5504,636,4 +6387,mul_650,call_function,mul.Tensor,backward,5,2,2,1,5506,630,8 +6388,mul_651,call_function,mul.Tensor,backward,5,2,2,1,634,636,8 +6389,alias_default_1156,call_function,alias.default,backward,5,1,1,2,5507,629,4 +6390,alias_default_1157,call_function,alias.default,backward,5,1,1,3,635,635,4 +6391,mul_652,call_function,mul.Tensor,backward,5,2,2,1,5511,628,8 +6392,sum_91,call_function,sum.dim_IntList,backward,5,1,1,1,5512,627,5 +6393,div_73,call_function,div.Tensor,backward,5,1,1,1,636,627,6 +6394,mul_653,call_function,mul.Tensor,backward,5,2,2,1,5514,626,8 +6395,sub_68,call_function,sub.Tensor,backward,5,2,2,1,5515,625,10 
+6396,mul_654,call_function,mul.Tensor,backward,5,2,2,1,5516,624,8 +6397,mul_655,call_function,mul.Tensor,backward,5,2,2,1,5508,4,8 +6398,sum_92,call_function,sum.dim_IntList,backward,5,1,1,1,5509,3,5 +6399,convert_element_type_1693,call_function,convert_element_type.default,backward,5,1,1,1,5517,623,6 +6400,convert_element_type_1694,call_function,convert_element_type.default,backward,5,1,1,1,5510,2,3 +6401,add_298,call_function,add.Tensor,unknown,,2,2,1,5518,622,10 +6402,dtype_cast_458,call_function,dtype_cast.default,backward,5,1,1,1,5511,1,3 +6403,alias_default_1300,call_function,alias.default,backward,5,1,1,0,5512,0,2 +6404,alias_default_1158,call_function,alias.default,unknown,,1,1,3,5519,621,4 +6405,einsum_default_513,call_function,einsum.default,backward,5,2,2,1,5520,3,5 +6406,permute_1031,call_function,permute.default,backward,5,1,1,1,4,617,3 +6407,einsum_default_514,call_function,einsum.default,backward,5,2,2,1,5521,616,5 +6408,permute_1032,call_function,permute.default,backward,5,1,1,1,5521,2,4 +6409,dtype_cast_459,call_function,dtype_cast.default,backward,5,1,1,1,5522,1,4 +6410,alias_default_1295,call_function,alias.default,backward,5,1,1,0,5523,0,3 +6411,view_1140,call_function,view.default,backward,5,1,1,1,5522,615,4 +6412,permute_1033,call_function,permute.default,backward,5,1,1,1,5523,614,4 +6413,_scaled_dot_product_flash_attention_backward_22,call_function,_scaled_dot_product_flash_attention_backward.default,backward,5,8,8,3,5527,613,2 +6414,getitem_318,call_function,getitem,backward,5,1,1,1,5528,586,2 +6415,getitem_319,call_function,getitem,backward,5,1,1,1,5528,587,2 +6416,getitem_320,call_function,getitem,backward,5,1,1,1,5528,580,2 +6417,permute_1034,call_function,permute.default,backward,5,1,1,1,5529,579,2 +6418,permute_1035,call_function,permute.default,backward,5,1,1,1,5529,586,2 +6419,permute_1036,call_function,permute.default,backward,5,1,1,1,5529,585,2 
+6420,convert_element_type_1699,call_function,convert_element_type.default,backward,5,1,1,1,5530,585,2 +6421,convert_element_type_1700,call_function,convert_element_type.default,backward,5,1,1,1,5530,584,2 +6422,view_1141,call_function,view.default,backward,5,1,1,1,5531,584,2 +6423,view_as_complex_100,call_function,view_as_complex.default,backward,5,1,1,1,5532,583,6 +6424,_conj_44,call_function,_conj.default,backward,5,1,1,1,4,584,3 +6425,clone_182,call_function,clone.default,backward,5,1,1,1,5,583,3 +6426,mul_656,call_function,mul.Tensor,backward,5,2,2,1,5535,582,8 +6427,view_1142,call_function,view.default,backward,5,1,1,1,5531,583,2 +6428,view_as_complex_101,call_function,view_as_complex.default,backward,5,1,1,1,5532,582,6 +6429,_conj_45,call_function,_conj.default,backward,5,1,1,1,4,583,3 +6430,clone_183,call_function,clone.default,backward,5,1,1,1,5,582,3 +6431,mul_657,call_function,mul.Tensor,backward,5,2,2,1,5535,581,8 +6432,view_as_real_100,call_function,view_as_real.default,backward,5,1,1,1,5536,581,6 +6433,view_1143,call_function,view.default,backward,5,1,1,1,5537,580,6 +6434,convert_element_type_1701,call_function,convert_element_type.default,backward,5,1,1,1,5538,579,6 +6435,view_as_real_101,call_function,view_as_real.default,backward,5,1,1,1,5536,580,6 +6436,view_1144,call_function,view.default,backward,5,1,1,1,5537,579,6 +6437,convert_element_type_1702,call_function,convert_element_type.default,backward,5,1,1,1,5538,578,6 +6438,view_1145,call_function,view.default,backward,5,1,1,1,5530,578,2 +6439,view_1146,call_function,view.default,backward,5,1,1,1,5539,578,5 +6440,view_1147,call_function,view.default,backward,5,1,1,1,5539,577,5 +6441,alias_default_1159,call_function,alias.default,backward,5,1,1,2,5531,577,4 +6442,einsum_default_515,call_function,einsum.default,backward,5,2,2,1,5532,3,5 +6443,permute_1039,call_function,permute.default,backward,5,1,1,1,4,573,3 +6444,einsum_default_516,call_function,einsum.default,backward,5,2,2,1,5533,572,5 
+6445,permute_1040,call_function,permute.default,backward,5,1,1,1,5533,2,4 +6446,dtype_cast_460,call_function,dtype_cast.default,backward,5,1,1,1,5534,1,4 +6447,alias_default_1294,call_function,alias.default,backward,5,1,1,0,5535,0,3 +6448,alias_default_1160,call_function,alias.default,backward,5,1,1,2,5540,577,4 +6449,einsum_default_517,call_function,einsum.default,backward,5,2,2,1,5541,3,5 +6450,permute_1043,call_function,permute.default,backward,5,1,1,1,4,573,3 +6451,einsum_default_518,call_function,einsum.default,backward,5,2,2,1,5542,572,5 +6452,add_299,call_function,add.Tensor,unknown,,2,2,1,5549,571,10 +6453,permute_1044,call_function,permute.default,backward,5,1,1,1,5542,2,4 +6454,dtype_cast_461,call_function,dtype_cast.default,backward,5,1,1,1,5543,1,4 +6455,alias_default_1293,call_function,alias.default,backward,5,1,1,0,5544,0,3 +6456,alias_default_1161,call_function,alias.default,backward,5,1,1,2,5540,576,4 +6457,einsum_default_519,call_function,einsum.default,backward,5,2,2,1,5541,3,5 +6458,permute_1047,call_function,permute.default,backward,5,1,1,1,4,572,3 +6459,einsum_default_520,call_function,einsum.default,backward,5,2,2,1,5542,571,5 +6460,add_300,call_function,add.Tensor,unknown,,2,2,1,5565,570,10 +6461,permute_1048,call_function,permute.default,backward,5,1,1,1,5542,2,4 +6462,dtype_cast_462,call_function,dtype_cast.default,backward,5,1,1,1,5543,1,4 +6463,alias_default_1292,call_function,alias.default,backward,5,1,1,0,5544,0,3 +6464,convert_element_type_1715,call_function,convert_element_type.default,backward,5,1,1,1,5566,569,8 +6465,convert_element_type_1716,call_function,convert_element_type.default,backward,5,1,1,1,559,569,4 +6466,convert_element_type_1717,call_function,convert_element_type.default,backward,5,1,1,1,3,563,2 +6467,alias_default_1162,call_function,alias.default,backward,5,1,1,2,5567,568,4 +6468,mul_658,call_function,mul.Tensor,backward,5,2,2,1,5569,562,8 +6469,mul_659,call_function,mul.Tensor,backward,5,2,2,1,567,568,8 
+6470,alias_default_1163,call_function,alias.default,backward,5,1,1,2,5570,561,4 +6471,alias_default_1164,call_function,alias.default,backward,5,1,1,3,568,567,4 +6472,mul_660,call_function,mul.Tensor,backward,5,2,2,1,5574,560,8 +6473,sum_93,call_function,sum.dim_IntList,backward,5,1,1,1,5575,559,5 +6474,div_74,call_function,div.Tensor,backward,5,1,1,1,569,559,6 +6475,mul_661,call_function,mul.Tensor,backward,5,2,2,1,5577,558,8 +6476,sub_69,call_function,sub.Tensor,backward,5,2,2,1,5578,557,10 +6477,mul_662,call_function,mul.Tensor,backward,5,2,2,1,5579,556,8 +6478,mul_663,call_function,mul.Tensor,backward,5,2,2,1,5571,4,8 +6479,sum_94,call_function,sum.dim_IntList,backward,5,1,1,1,5572,3,5 +6480,convert_element_type_1718,call_function,convert_element_type.default,backward,5,1,1,1,5580,555,6 +6481,convert_element_type_1719,call_function,convert_element_type.default,backward,5,1,1,1,5573,2,3 +6482,add_301,call_function,add.Tensor,unknown,,2,2,1,5581,554,10 +6483,dtype_cast_463,call_function,dtype_cast.default,backward,5,1,1,1,5574,1,3 +6484,alias_default_1299,call_function,alias.default,backward,5,1,1,0,5575,0,2 +6485,alias_default_1165,call_function,alias.default,unknown,,1,1,3,5582,553,4 +6486,einsum_default_521,call_function,einsum.default,backward,4,2,2,1,5583,3,5 +6487,permute_1051,call_function,permute.default,backward,4,1,1,1,4,549,3 +6488,einsum_default_522,call_function,einsum.default,backward,4,2,2,1,5584,548,5 +6489,permute_1052,call_function,permute.default,backward,4,1,1,1,5584,2,4 +6490,dtype_cast_464,call_function,dtype_cast.default,backward,4,1,1,1,5585,1,4 +6491,alias_default_1288,call_function,alias.default,backward,4,1,1,0,5586,0,3 +6492,alias_default_1166,call_function,alias.default,backward,4,1,1,2,5585,547,4 +6493,mul_664,call_function,mul.Tensor,backward,4,2,2,1,5586,535,8 +6494,mul_665,call_function,mul.Tensor,backward,4,2,2,1,5586,539,8 +6495,alias_default_1167,call_function,alias.default,backward,4,1,1,2,5587,534,4 
+6496,einsum_default_523,call_function,einsum.default,backward,4,2,2,1,5588,3,5 +6497,permute_1055,call_function,permute.default,backward,4,1,1,1,4,530,3 +6498,einsum_default_524,call_function,einsum.default,backward,4,2,2,1,5589,529,5 +6499,permute_1056,call_function,permute.default,backward,4,1,1,1,5589,2,4 +6500,dtype_cast_465,call_function,dtype_cast.default,backward,4,1,1,1,5590,1,4 +6501,alias_default_1289,call_function,alias.default,backward,4,1,1,0,5591,0,3 +6502,convert_element_type_1728,call_function,convert_element_type.default,backward,4,1,1,1,5587,538,6 +6503,convert_element_type_1729,call_function,convert_element_type.default,backward,4,1,1,1,536,548,4 +6504,alias_default_1168,call_function,alias.default,backward,4,1,1,2,537,547,4 +6505,neg_51,call_function,neg.default,backward,4,1,1,1,538,546,8 +6506,exp_51,call_function,exp.default,backward,4,1,1,1,539,545,6 +6507,add_302,call_function,add.Tensor,backward,4,1,1,1,540,544,4 +6508,reciprocal_23,call_function,reciprocal.default,backward,4,1,1,1,541,543,4 +6509,mul_666,call_function,mul.Tensor,backward,4,1,1,1,542,542,6 +6510,alias_default_1169,call_function,alias.default,backward,4,1,1,2,543,541,4 +6511,mul_667,call_function,mul.Tensor,backward,4,2,2,1,5596,537,8 +6512,sub_70,call_function,sub.Tensor,backward,4,1,1,1,544,539,4 +6513,mul_668,call_function,mul.Tensor,backward,4,2,2,1,545,538,8 +6514,add_303,call_function,add.Tensor,backward,4,1,1,1,546,537,4 +6515,mul_669,call_function,mul.Tensor,backward,4,2,2,1,5600,536,8 +6516,convert_element_type_1730,call_function,convert_element_type.default,backward,4,1,1,1,5601,535,6 +6517,alias_default_1170,call_function,alias.default,backward,4,1,1,2,5602,534,4 +6518,einsum_default_525,call_function,einsum.default,backward,4,2,2,1,5603,3,5 +6519,permute_1059,call_function,permute.default,backward,4,1,1,1,4,530,3 +6520,einsum_default_526,call_function,einsum.default,backward,4,2,2,1,5604,529,5 +6521,add_304,call_function,add.Tensor,unknown,,2,2,1,5609,528,10 
+6522,permute_1060,call_function,permute.default,backward,4,1,1,1,5604,2,4 +6523,dtype_cast_466,call_function,dtype_cast.default,backward,4,1,1,1,5605,1,4 +6524,alias_default_1287,call_function,alias.default,backward,4,1,1,0,5606,0,3 +6525,convert_element_type_1735,call_function,convert_element_type.default,backward,4,1,1,1,5610,527,8 +6526,convert_element_type_1736,call_function,convert_element_type.default,backward,4,1,1,1,516,527,4 +6527,convert_element_type_1737,call_function,convert_element_type.default,backward,4,1,1,1,3,521,2 +6528,alias_default_1171,call_function,alias.default,backward,4,1,1,2,5611,526,4 +6529,mul_670,call_function,mul.Tensor,backward,4,2,2,1,5613,520,8 +6530,mul_671,call_function,mul.Tensor,backward,4,2,2,1,524,526,8 +6531,alias_default_1172,call_function,alias.default,backward,4,1,1,2,5614,519,4 +6532,alias_default_1173,call_function,alias.default,backward,4,1,1,3,525,525,4 +6533,mul_672,call_function,mul.Tensor,backward,4,2,2,1,5618,518,8 +6534,sum_95,call_function,sum.dim_IntList,backward,4,1,1,1,5619,517,5 +6535,div_75,call_function,div.Tensor,backward,4,1,1,1,526,517,6 +6536,mul_673,call_function,mul.Tensor,backward,4,2,2,1,5621,516,8 +6537,sub_71,call_function,sub.Tensor,backward,4,2,2,1,5622,515,10 +6538,mul_674,call_function,mul.Tensor,backward,4,2,2,1,5623,514,8 +6539,mul_675,call_function,mul.Tensor,backward,4,2,2,1,5615,4,8 +6540,sum_96,call_function,sum.dim_IntList,backward,4,1,1,1,5616,3,5 +6541,convert_element_type_1738,call_function,convert_element_type.default,backward,4,1,1,1,5624,513,6 +6542,convert_element_type_1739,call_function,convert_element_type.default,backward,4,1,1,1,5617,2,3 +6543,add_305,call_function,add.Tensor,unknown,,2,2,1,5625,512,10 +6544,dtype_cast_467,call_function,dtype_cast.default,backward,4,1,1,1,5618,1,3 +6545,alias_default_1291,call_function,alias.default,backward,4,1,1,0,5619,0,2 +6546,alias_default_1174,call_function,alias.default,unknown,,1,1,3,5626,511,4 
+6547,einsum_default_527,call_function,einsum.default,backward,4,2,2,1,5627,3,5 +6548,permute_1063,call_function,permute.default,backward,4,1,1,1,4,507,3 +6549,einsum_default_528,call_function,einsum.default,backward,4,2,2,1,5628,506,5 +6550,permute_1064,call_function,permute.default,backward,4,1,1,1,5628,2,4 +6551,dtype_cast_468,call_function,dtype_cast.default,backward,4,1,1,1,5629,1,4 +6552,alias_default_1286,call_function,alias.default,backward,4,1,1,0,5630,0,3 +6553,view_1162,call_function,view.default,backward,4,1,1,1,5629,505,4 +6554,permute_1065,call_function,permute.default,backward,4,1,1,1,5630,504,4 +6555,_scaled_dot_product_flash_attention_backward_23,call_function,_scaled_dot_product_flash_attention_backward.default,backward,4,8,8,3,5634,503,2 +6556,getitem_321,call_function,getitem,backward,4,1,1,1,5635,476,2 +6557,getitem_322,call_function,getitem,backward,4,1,1,1,5635,477,2 +6558,getitem_323,call_function,getitem,backward,4,1,1,1,5635,470,2 +6559,permute_1066,call_function,permute.default,backward,4,1,1,1,5636,469,2 +6560,permute_1067,call_function,permute.default,backward,4,1,1,1,5636,476,2 +6561,permute_1068,call_function,permute.default,backward,4,1,1,1,5636,475,2 +6562,convert_element_type_1744,call_function,convert_element_type.default,backward,4,1,1,1,5637,475,2 +6563,convert_element_type_1745,call_function,convert_element_type.default,backward,4,1,1,1,5637,474,2 +6564,view_1163,call_function,view.default,backward,4,1,1,1,5638,474,2 +6565,view_as_complex_102,call_function,view_as_complex.default,backward,4,1,1,1,5639,473,6 +6566,_conj_46,call_function,_conj.default,backward,4,1,1,1,4,474,3 +6567,clone_190,call_function,clone.default,backward,4,1,1,1,5,473,3 +6568,mul_676,call_function,mul.Tensor,backward,4,2,2,1,5642,472,8 +6569,view_1164,call_function,view.default,backward,4,1,1,1,5638,473,2 +6570,view_as_complex_103,call_function,view_as_complex.default,backward,4,1,1,1,5639,472,6 
+6571,_conj_47,call_function,_conj.default,backward,4,1,1,1,4,473,3 +6572,clone_191,call_function,clone.default,backward,4,1,1,1,5,472,3 +6573,mul_677,call_function,mul.Tensor,backward,4,2,2,1,5642,471,8 +6574,view_as_real_102,call_function,view_as_real.default,backward,4,1,1,1,5643,471,6 +6575,view_1165,call_function,view.default,backward,4,1,1,1,5644,470,6 +6576,convert_element_type_1746,call_function,convert_element_type.default,backward,4,1,1,1,5645,469,6 +6577,view_as_real_103,call_function,view_as_real.default,backward,4,1,1,1,5643,470,6 +6578,view_1166,call_function,view.default,backward,4,1,1,1,5644,469,6 +6579,convert_element_type_1747,call_function,convert_element_type.default,backward,4,1,1,1,5645,468,6 +6580,view_1167,call_function,view.default,backward,4,1,1,1,5637,468,2 +6581,view_1168,call_function,view.default,backward,4,1,1,1,5646,468,5 +6582,view_1169,call_function,view.default,backward,4,1,1,1,5646,467,5 +6583,alias_default_1175,call_function,alias.default,backward,4,1,1,2,5638,467,4 +6584,einsum_default_529,call_function,einsum.default,backward,4,2,2,1,5639,3,5 +6585,permute_1071,call_function,permute.default,backward,4,1,1,1,4,463,3 +6586,einsum_default_530,call_function,einsum.default,backward,4,2,2,1,5640,462,5 +6587,permute_1072,call_function,permute.default,backward,4,1,1,1,5640,2,4 +6588,dtype_cast_469,call_function,dtype_cast.default,backward,4,1,1,1,5641,1,4 +6589,alias_default_1285,call_function,alias.default,backward,4,1,1,0,5642,0,3 +6590,alias_default_1176,call_function,alias.default,backward,4,1,1,2,5647,467,4 +6591,einsum_default_531,call_function,einsum.default,backward,4,2,2,1,5648,3,5 +6592,permute_1075,call_function,permute.default,backward,4,1,1,1,4,463,3 +6593,einsum_default_532,call_function,einsum.default,backward,4,2,2,1,5649,462,5 +6594,add_306,call_function,add.Tensor,unknown,,2,2,1,5656,461,10 +6595,permute_1076,call_function,permute.default,backward,4,1,1,1,5649,2,4 
+6596,dtype_cast_470,call_function,dtype_cast.default,backward,4,1,1,1,5650,1,4 +6597,alias_default_1284,call_function,alias.default,backward,4,1,1,0,5651,0,3 +6598,alias_default_1177,call_function,alias.default,backward,4,1,1,2,5647,466,4 +6599,einsum_default_533,call_function,einsum.default,backward,4,2,2,1,5648,3,5 +6600,permute_1079,call_function,permute.default,backward,4,1,1,1,4,462,3 +6601,einsum_default_534,call_function,einsum.default,backward,4,2,2,1,5649,461,5 +6602,add_307,call_function,add.Tensor,unknown,,2,2,1,5672,460,10 +6603,permute_1080,call_function,permute.default,backward,4,1,1,1,5649,2,4 +6604,dtype_cast_471,call_function,dtype_cast.default,backward,4,1,1,1,5650,1,4 +6605,alias_default_1283,call_function,alias.default,backward,4,1,1,0,5651,0,3 +6606,convert_element_type_1760,call_function,convert_element_type.default,backward,4,1,1,1,5673,459,8 +6607,convert_element_type_1761,call_function,convert_element_type.default,backward,4,1,1,1,449,459,4 +6608,convert_element_type_1762,call_function,convert_element_type.default,backward,4,1,1,1,3,453,2 +6609,alias_default_1178,call_function,alias.default,backward,4,1,1,2,5674,458,4 +6610,mul_678,call_function,mul.Tensor,backward,4,2,2,1,5676,452,8 +6611,mul_679,call_function,mul.Tensor,backward,4,2,2,1,457,458,8 +6612,alias_default_1179,call_function,alias.default,backward,4,1,1,2,5677,451,4 +6613,alias_default_1180,call_function,alias.default,backward,4,1,1,3,458,457,4 +6614,mul_680,call_function,mul.Tensor,backward,4,2,2,1,5681,450,8 +6615,sum_97,call_function,sum.dim_IntList,backward,4,1,1,1,5682,449,5 +6616,div_76,call_function,div.Tensor,backward,4,1,1,1,459,449,6 +6617,mul_681,call_function,mul.Tensor,backward,4,2,2,1,5684,448,8 +6618,sub_72,call_function,sub.Tensor,backward,4,2,2,1,5685,447,10 +6619,mul_682,call_function,mul.Tensor,backward,4,2,2,1,5686,446,8 +6620,mul_683,call_function,mul.Tensor,backward,4,2,2,1,5678,4,8 +6621,sum_98,call_function,sum.dim_IntList,backward,4,1,1,1,5679,3,5 
+6622,convert_element_type_1763,call_function,convert_element_type.default,backward,4,1,1,1,5687,445,6 +6623,convert_element_type_1764,call_function,convert_element_type.default,backward,4,1,1,1,5680,2,3 +6624,add_308,call_function,add.Tensor,unknown,,2,2,1,5688,444,10 +6625,dtype_cast_472,call_function,dtype_cast.default,backward,4,1,1,1,5681,1,3 +6626,alias_default_1290,call_function,alias.default,backward,4,1,1,0,5682,0,2 +6627,alias_default_1181,call_function,alias.default,unknown,,1,1,3,5689,443,4 +6628,einsum_default_535,call_function,einsum.default,backward,3,2,2,1,5690,3,5 +6629,permute_1083,call_function,permute.default,backward,3,1,1,1,4,439,3 +6630,einsum_default_536,call_function,einsum.default,backward,3,2,2,1,5691,438,5 +6631,permute_1084,call_function,permute.default,backward,3,1,1,1,5691,2,4 +6632,dtype_cast_473,call_function,dtype_cast.default,backward,3,1,1,1,5692,1,4 +6633,alias_default_1279,call_function,alias.default,backward,3,1,1,0,5693,0,3 +6634,alias_default_1182,call_function,alias.default,backward,3,1,1,2,5692,437,4 +6635,mul_684,call_function,mul.Tensor,backward,3,2,2,1,5693,425,8 +6636,mul_685,call_function,mul.Tensor,backward,3,2,2,1,5693,429,8 +6637,alias_default_1183,call_function,alias.default,backward,3,1,1,2,5694,424,4 +6638,einsum_default_537,call_function,einsum.default,backward,3,2,2,1,5695,3,5 +6639,permute_1087,call_function,permute.default,backward,3,1,1,1,4,420,3 +6640,einsum_default_538,call_function,einsum.default,backward,3,2,2,1,5696,419,5 +6641,permute_1088,call_function,permute.default,backward,3,1,1,1,5696,2,4 +6642,dtype_cast_474,call_function,dtype_cast.default,backward,3,1,1,1,5697,1,4 +6643,alias_default_1280,call_function,alias.default,backward,3,1,1,0,5698,0,3 +6644,convert_element_type_1773,call_function,convert_element_type.default,backward,3,1,1,1,5694,428,6 +6645,convert_element_type_1774,call_function,convert_element_type.default,backward,3,1,1,1,426,438,4 
+6646,alias_default_1184,call_function,alias.default,backward,3,1,1,2,427,437,4 +6647,neg_52,call_function,neg.default,backward,3,1,1,1,428,436,8 +6648,exp_52,call_function,exp.default,backward,3,1,1,1,429,435,6 +6649,add_309,call_function,add.Tensor,backward,3,1,1,1,430,434,4 +6650,reciprocal_24,call_function,reciprocal.default,backward,3,1,1,1,431,433,4 +6651,mul_686,call_function,mul.Tensor,backward,3,1,1,1,432,432,6 +6652,alias_default_1185,call_function,alias.default,backward,3,1,1,2,433,431,4 +6653,mul_687,call_function,mul.Tensor,backward,3,2,2,1,5703,427,8 +6654,sub_73,call_function,sub.Tensor,backward,3,1,1,1,434,429,4 +6655,mul_688,call_function,mul.Tensor,backward,3,2,2,1,435,428,8 +6656,add_310,call_function,add.Tensor,backward,3,1,1,1,436,427,4 +6657,mul_689,call_function,mul.Tensor,backward,3,2,2,1,5707,426,8 +6658,convert_element_type_1775,call_function,convert_element_type.default,backward,3,1,1,1,5708,425,6 +6659,alias_default_1186,call_function,alias.default,backward,3,1,1,2,5709,424,4 +6660,einsum_default_539,call_function,einsum.default,backward,3,2,2,1,5710,3,5 +6661,permute_1091,call_function,permute.default,backward,3,1,1,1,4,420,3 +6662,einsum_default_540,call_function,einsum.default,backward,3,2,2,1,5711,419,5 +6663,add_311,call_function,add.Tensor,unknown,,2,2,1,5716,418,10 +6664,permute_1092,call_function,permute.default,backward,3,1,1,1,5711,2,4 +6665,dtype_cast_475,call_function,dtype_cast.default,backward,3,1,1,1,5712,1,4 +6666,alias_default_1278,call_function,alias.default,backward,3,1,1,0,5713,0,3 +6667,convert_element_type_1780,call_function,convert_element_type.default,backward,3,1,1,1,5717,417,8 +6668,convert_element_type_1781,call_function,convert_element_type.default,backward,3,1,1,1,406,417,4 +6669,convert_element_type_1782,call_function,convert_element_type.default,backward,3,1,1,1,3,411,2 +6670,alias_default_1187,call_function,alias.default,backward,3,1,1,2,5718,416,4 
+6671,mul_690,call_function,mul.Tensor,backward,3,2,2,1,5720,410,8 +6672,mul_691,call_function,mul.Tensor,backward,3,2,2,1,414,416,8 +6673,alias_default_1188,call_function,alias.default,backward,3,1,1,2,5721,409,4 +6674,alias_default_1189,call_function,alias.default,backward,3,1,1,3,415,415,4 +6675,mul_692,call_function,mul.Tensor,backward,3,2,2,1,5725,408,8 +6676,sum_99,call_function,sum.dim_IntList,backward,3,1,1,1,5726,407,5 +6677,div_77,call_function,div.Tensor,backward,3,1,1,1,416,407,6 +6678,mul_693,call_function,mul.Tensor,backward,3,2,2,1,5728,406,8 +6679,sub_74,call_function,sub.Tensor,backward,3,2,2,1,5729,405,10 +6680,mul_694,call_function,mul.Tensor,backward,3,2,2,1,5730,404,8 +6681,mul_695,call_function,mul.Tensor,backward,3,2,2,1,5722,4,8 +6682,sum_100,call_function,sum.dim_IntList,backward,3,1,1,1,5723,3,5 +6683,convert_element_type_1783,call_function,convert_element_type.default,backward,3,1,1,1,5731,403,6 +6684,convert_element_type_1784,call_function,convert_element_type.default,backward,3,1,1,1,5724,2,3 +6685,add_312,call_function,add.Tensor,unknown,,2,2,1,5732,402,10 +6686,dtype_cast_476,call_function,dtype_cast.default,backward,3,1,1,1,5725,1,3 +6687,alias_default_1282,call_function,alias.default,backward,3,1,1,0,5726,0,2 +6688,alias_default_1190,call_function,alias.default,unknown,,1,1,3,5733,401,4 +6689,einsum_default_541,call_function,einsum.default,backward,3,2,2,1,5734,3,5 +6690,permute_1095,call_function,permute.default,backward,3,1,1,1,4,397,3 +6691,einsum_default_542,call_function,einsum.default,backward,3,2,2,1,5735,396,5 +6692,permute_1096,call_function,permute.default,backward,3,1,1,1,5735,2,4 +6693,dtype_cast_477,call_function,dtype_cast.default,backward,3,1,1,1,5736,1,4 +6694,alias_default_1277,call_function,alias.default,backward,3,1,1,0,5737,0,3 +6695,view_1184,call_function,view.default,backward,3,1,1,1,5736,395,4 +6696,permute_1097,call_function,permute.default,backward,3,1,1,1,5737,394,4 
+6697,_scaled_dot_product_flash_attention_backward_24,call_function,_scaled_dot_product_flash_attention_backward.default,backward,3,8,8,3,5741,393,2 +6698,getitem_324,call_function,getitem,backward,3,1,1,1,5742,366,2 +6699,getitem_325,call_function,getitem,backward,3,1,1,1,5742,367,2 +6700,getitem_326,call_function,getitem,backward,3,1,1,1,5742,360,2 +6701,permute_1098,call_function,permute.default,backward,3,1,1,1,5743,359,2 +6702,permute_1099,call_function,permute.default,backward,3,1,1,1,5743,366,2 +6703,permute_1100,call_function,permute.default,backward,3,1,1,1,5743,365,2 +6704,convert_element_type_1789,call_function,convert_element_type.default,backward,3,1,1,1,5744,365,2 +6705,convert_element_type_1790,call_function,convert_element_type.default,backward,3,1,1,1,5744,364,2 +6706,view_1185,call_function,view.default,backward,3,1,1,1,5745,364,2 +6707,view_as_complex_104,call_function,view_as_complex.default,backward,3,1,1,1,5746,363,6 +6708,_conj_48,call_function,_conj.default,backward,3,1,1,1,4,364,3 +6709,clone_198,call_function,clone.default,backward,3,1,1,1,5,363,3 +6710,mul_696,call_function,mul.Tensor,backward,3,2,2,1,5749,362,8 +6711,view_1186,call_function,view.default,backward,3,1,1,1,5745,363,2 +6712,view_as_complex_105,call_function,view_as_complex.default,backward,3,1,1,1,5746,362,6 +6713,_conj_49,call_function,_conj.default,backward,3,1,1,1,4,363,3 +6714,clone_199,call_function,clone.default,backward,3,1,1,1,5,362,3 +6715,mul_697,call_function,mul.Tensor,backward,3,2,2,1,5749,361,8 +6716,view_as_real_104,call_function,view_as_real.default,backward,3,1,1,1,5750,361,6 +6717,view_1187,call_function,view.default,backward,3,1,1,1,5751,360,6 +6718,convert_element_type_1791,call_function,convert_element_type.default,backward,3,1,1,1,5752,359,6 +6719,view_as_real_105,call_function,view_as_real.default,backward,3,1,1,1,5750,360,6 +6720,view_1188,call_function,view.default,backward,3,1,1,1,5751,359,6 
+6721,convert_element_type_1792,call_function,convert_element_type.default,backward,3,1,1,1,5752,358,6 +6722,view_1189,call_function,view.default,backward,3,1,1,1,5744,358,2 +6723,view_1190,call_function,view.default,backward,3,1,1,1,5753,358,5 +6724,view_1191,call_function,view.default,backward,3,1,1,1,5753,357,5 +6725,alias_default_1191,call_function,alias.default,backward,3,1,1,2,5745,357,4 +6726,einsum_default_543,call_function,einsum.default,backward,3,2,2,1,5746,3,5 +6727,permute_1103,call_function,permute.default,backward,3,1,1,1,4,353,3 +6728,einsum_default_544,call_function,einsum.default,backward,3,2,2,1,5747,352,5 +6729,permute_1104,call_function,permute.default,backward,3,1,1,1,5747,2,4 +6730,dtype_cast_478,call_function,dtype_cast.default,backward,3,1,1,1,5748,1,4 +6731,alias_default_1276,call_function,alias.default,backward,3,1,1,0,5749,0,3 +6732,alias_default_1192,call_function,alias.default,backward,3,1,1,2,5754,357,4 +6733,einsum_default_545,call_function,einsum.default,backward,3,2,2,1,5755,3,5 +6734,permute_1107,call_function,permute.default,backward,3,1,1,1,4,353,3 +6735,einsum_default_546,call_function,einsum.default,backward,3,2,2,1,5756,352,5 +6736,add_313,call_function,add.Tensor,unknown,,2,2,1,5763,351,10 +6737,permute_1108,call_function,permute.default,backward,3,1,1,1,5756,2,4 +6738,dtype_cast_479,call_function,dtype_cast.default,backward,3,1,1,1,5757,1,4 +6739,alias_default_1275,call_function,alias.default,backward,3,1,1,0,5758,0,3 +6740,alias_default_1193,call_function,alias.default,backward,3,1,1,2,5754,356,4 +6741,einsum_default_547,call_function,einsum.default,backward,3,2,2,1,5755,3,5 +6742,permute_1111,call_function,permute.default,backward,3,1,1,1,4,352,3 +6743,einsum_default_548,call_function,einsum.default,backward,3,2,2,1,5756,351,5 +6744,add_314,call_function,add.Tensor,unknown,,2,2,1,5779,350,10 +6745,permute_1112,call_function,permute.default,backward,3,1,1,1,5756,2,4 
+6746,dtype_cast_480,call_function,dtype_cast.default,backward,3,1,1,1,5757,1,4 +6747,alias_default_1274,call_function,alias.default,backward,3,1,1,0,5758,0,3 +6748,convert_element_type_1805,call_function,convert_element_type.default,backward,3,1,1,1,5780,349,8 +6749,convert_element_type_1806,call_function,convert_element_type.default,backward,3,1,1,1,339,349,4 +6750,convert_element_type_1807,call_function,convert_element_type.default,backward,3,1,1,1,3,343,2 +6751,alias_default_1194,call_function,alias.default,backward,3,1,1,2,5781,348,4 +6752,mul_698,call_function,mul.Tensor,backward,3,2,2,1,5783,342,8 +6753,mul_699,call_function,mul.Tensor,backward,3,2,2,1,347,348,8 +6754,alias_default_1195,call_function,alias.default,backward,3,1,1,2,5784,341,4 +6755,alias_default_1196,call_function,alias.default,backward,3,1,1,3,348,347,4 +6756,mul_700,call_function,mul.Tensor,backward,3,2,2,1,5788,340,8 +6757,sum_101,call_function,sum.dim_IntList,backward,3,1,1,1,5789,339,5 +6758,div_78,call_function,div.Tensor,backward,3,1,1,1,349,339,6 +6759,mul_701,call_function,mul.Tensor,backward,3,2,2,1,5791,338,8 +6760,sub_75,call_function,sub.Tensor,backward,3,2,2,1,5792,337,10 +6761,mul_702,call_function,mul.Tensor,backward,3,2,2,1,5793,336,8 +6762,mul_703,call_function,mul.Tensor,backward,3,2,2,1,5785,4,8 +6763,sum_102,call_function,sum.dim_IntList,backward,3,1,1,1,5786,3,5 +6764,convert_element_type_1808,call_function,convert_element_type.default,backward,3,1,1,1,5794,335,6 +6765,convert_element_type_1809,call_function,convert_element_type.default,backward,3,1,1,1,5787,2,3 +6766,add_315,call_function,add.Tensor,unknown,,2,2,1,5795,334,10 +6767,dtype_cast_481,call_function,dtype_cast.default,backward,3,1,1,1,5788,1,3 +6768,alias_default_1281,call_function,alias.default,backward,3,1,1,0,5789,0,2 +6769,alias_default_1197,call_function,alias.default,unknown,,1,1,3,5796,333,4 +6770,einsum_default_549,call_function,einsum.default,backward,2,2,2,1,5797,3,5 
+6771,permute_1115,call_function,permute.default,backward,2,1,1,1,4,329,3 +6772,einsum_default_550,call_function,einsum.default,backward,2,2,2,1,5798,328,5 +6773,permute_1116,call_function,permute.default,backward,2,1,1,1,5798,2,4 +6774,dtype_cast_482,call_function,dtype_cast.default,backward,2,1,1,1,5799,1,4 +6775,alias_default_1270,call_function,alias.default,backward,2,1,1,0,5800,0,3 +6776,alias_default_1198,call_function,alias.default,backward,2,1,1,2,5799,327,4 +6777,mul_704,call_function,mul.Tensor,backward,2,2,2,1,5800,315,8 +6778,mul_705,call_function,mul.Tensor,backward,2,2,2,1,5800,319,8 +6779,alias_default_1199,call_function,alias.default,backward,2,1,1,2,5801,314,4 +6780,einsum_default_551,call_function,einsum.default,backward,2,2,2,1,5802,3,5 +6781,permute_1119,call_function,permute.default,backward,2,1,1,1,4,310,3 +6782,einsum_default_552,call_function,einsum.default,backward,2,2,2,1,5803,309,5 +6783,permute_1120,call_function,permute.default,backward,2,1,1,1,5803,2,4 +6784,dtype_cast_483,call_function,dtype_cast.default,backward,2,1,1,1,5804,1,4 +6785,alias_default_1271,call_function,alias.default,backward,2,1,1,0,5805,0,3 +6786,convert_element_type_1818,call_function,convert_element_type.default,backward,2,1,1,1,5801,318,6 +6787,convert_element_type_1819,call_function,convert_element_type.default,backward,2,1,1,1,316,328,4 +6788,alias_default_1200,call_function,alias.default,backward,2,1,1,2,317,327,4 +6789,neg_53,call_function,neg.default,backward,2,1,1,1,318,326,8 +6790,exp_53,call_function,exp.default,backward,2,1,1,1,319,325,6 +6791,add_316,call_function,add.Tensor,backward,2,1,1,1,320,324,4 +6792,reciprocal_25,call_function,reciprocal.default,backward,2,1,1,1,321,323,4 +6793,mul_706,call_function,mul.Tensor,backward,2,1,1,1,322,322,6 +6794,alias_default_1201,call_function,alias.default,backward,2,1,1,2,323,321,4 +6795,mul_707,call_function,mul.Tensor,backward,2,2,2,1,5810,317,8 +6796,sub_76,call_function,sub.Tensor,backward,2,1,1,1,324,319,4 
+6797,mul_708,call_function,mul.Tensor,backward,2,2,2,1,325,318,8 +6798,add_317,call_function,add.Tensor,backward,2,1,1,1,326,317,4 +6799,mul_709,call_function,mul.Tensor,backward,2,2,2,1,5814,316,8 +6800,convert_element_type_1820,call_function,convert_element_type.default,backward,2,1,1,1,5815,315,6 +6801,alias_default_1202,call_function,alias.default,backward,2,1,1,2,5816,314,4 +6802,einsum_default_553,call_function,einsum.default,backward,2,2,2,1,5817,3,5 +6803,permute_1123,call_function,permute.default,backward,2,1,1,1,4,310,3 +6804,einsum_default_554,call_function,einsum.default,backward,2,2,2,1,5818,309,5 +6805,add_318,call_function,add.Tensor,unknown,,2,2,1,5823,308,10 +6806,permute_1124,call_function,permute.default,backward,2,1,1,1,5818,2,4 +6807,dtype_cast_484,call_function,dtype_cast.default,backward,2,1,1,1,5819,1,4 +6808,alias_default_1269,call_function,alias.default,backward,2,1,1,0,5820,0,3 +6809,convert_element_type_1825,call_function,convert_element_type.default,backward,2,1,1,1,5824,307,8 +6810,convert_element_type_1826,call_function,convert_element_type.default,backward,2,1,1,1,296,307,4 +6811,convert_element_type_1827,call_function,convert_element_type.default,backward,2,1,1,1,3,301,2 +6812,alias_default_1203,call_function,alias.default,backward,2,1,1,2,5825,306,4 +6813,mul_710,call_function,mul.Tensor,backward,2,2,2,1,5827,300,8 +6814,mul_711,call_function,mul.Tensor,backward,2,2,2,1,304,306,8 +6815,alias_default_1204,call_function,alias.default,backward,2,1,1,2,5828,299,4 +6816,alias_default_1205,call_function,alias.default,backward,2,1,1,3,305,305,4 +6817,mul_712,call_function,mul.Tensor,backward,2,2,2,1,5832,298,8 +6818,sum_103,call_function,sum.dim_IntList,backward,2,1,1,1,5833,297,5 +6819,div_79,call_function,div.Tensor,backward,2,1,1,1,306,297,6 +6820,mul_713,call_function,mul.Tensor,backward,2,2,2,1,5835,296,8 +6821,sub_77,call_function,sub.Tensor,backward,2,2,2,1,5836,295,10 
+6822,mul_714,call_function,mul.Tensor,backward,2,2,2,1,5837,294,8 +6823,mul_715,call_function,mul.Tensor,backward,2,2,2,1,5829,4,8 +6824,sum_104,call_function,sum.dim_IntList,backward,2,1,1,1,5830,3,5 +6825,convert_element_type_1828,call_function,convert_element_type.default,backward,2,1,1,1,5838,293,6 +6826,convert_element_type_1829,call_function,convert_element_type.default,backward,2,1,1,1,5831,2,3 +6827,add_319,call_function,add.Tensor,unknown,,2,2,1,5839,292,10 +6828,dtype_cast_485,call_function,dtype_cast.default,backward,2,1,1,1,5832,1,3 +6829,alias_default_1273,call_function,alias.default,backward,2,1,1,0,5833,0,2 +6830,alias_default_1206,call_function,alias.default,unknown,,1,1,3,5840,291,4 +6831,einsum_default_555,call_function,einsum.default,backward,2,2,2,1,5841,3,5 +6832,permute_1127,call_function,permute.default,backward,2,1,1,1,4,287,3 +6833,einsum_default_556,call_function,einsum.default,backward,2,2,2,1,5842,286,5 +6834,permute_1128,call_function,permute.default,backward,2,1,1,1,5842,2,4 +6835,dtype_cast_486,call_function,dtype_cast.default,backward,2,1,1,1,5843,1,4 +6836,alias_default_1268,call_function,alias.default,backward,2,1,1,0,5844,0,3 +6837,view_1206,call_function,view.default,backward,2,1,1,1,5843,285,4 +6838,permute_1129,call_function,permute.default,backward,2,1,1,1,5844,284,4 +6839,_scaled_dot_product_flash_attention_backward_25,call_function,_scaled_dot_product_flash_attention_backward.default,backward,2,8,8,3,5848,283,2 +6840,getitem_327,call_function,getitem,backward,2,1,1,1,5849,256,2 +6841,getitem_328,call_function,getitem,backward,2,1,1,1,5849,257,2 +6842,getitem_329,call_function,getitem,backward,2,1,1,1,5849,250,2 +6843,permute_1130,call_function,permute.default,backward,2,1,1,1,5850,249,2 +6844,permute_1131,call_function,permute.default,backward,2,1,1,1,5850,256,2 +6845,permute_1132,call_function,permute.default,backward,2,1,1,1,5850,255,2 
+6846,convert_element_type_1834,call_function,convert_element_type.default,backward,2,1,1,1,5851,255,2 +6847,convert_element_type_1835,call_function,convert_element_type.default,backward,2,1,1,1,5851,254,2 +6848,view_1207,call_function,view.default,backward,2,1,1,1,5852,254,2 +6849,view_as_complex_106,call_function,view_as_complex.default,backward,2,1,1,1,5853,253,6 +6850,_conj_50,call_function,_conj.default,backward,2,1,1,1,4,254,3 +6851,clone_206,call_function,clone.default,backward,2,1,1,1,5,253,3 +6852,mul_716,call_function,mul.Tensor,backward,2,2,2,1,5856,252,8 +6853,view_1208,call_function,view.default,backward,2,1,1,1,5852,253,2 +6854,view_as_complex_107,call_function,view_as_complex.default,backward,2,1,1,1,5853,252,6 +6855,_conj_51,call_function,_conj.default,backward,2,1,1,1,4,253,3 +6856,clone_207,call_function,clone.default,backward,2,1,1,1,5,252,3 +6857,mul_717,call_function,mul.Tensor,backward,2,2,2,1,5856,251,8 +6858,view_as_real_106,call_function,view_as_real.default,backward,2,1,1,1,5857,251,6 +6859,view_1209,call_function,view.default,backward,2,1,1,1,5858,250,6 +6860,convert_element_type_1836,call_function,convert_element_type.default,backward,2,1,1,1,5859,249,6 +6861,view_as_real_107,call_function,view_as_real.default,backward,2,1,1,1,5857,250,6 +6862,view_1210,call_function,view.default,backward,2,1,1,1,5858,249,6 +6863,convert_element_type_1837,call_function,convert_element_type.default,backward,2,1,1,1,5859,248,6 +6864,view_1211,call_function,view.default,backward,2,1,1,1,5851,248,2 +6865,view_1212,call_function,view.default,backward,2,1,1,1,5860,248,5 +6866,view_1213,call_function,view.default,backward,2,1,1,1,5860,247,5 +6867,alias_default_1207,call_function,alias.default,backward,2,1,1,2,5852,247,4 +6868,einsum_default_557,call_function,einsum.default,backward,2,2,2,1,5853,3,5 +6869,permute_1135,call_function,permute.default,backward,2,1,1,1,4,243,3 +6870,einsum_default_558,call_function,einsum.default,backward,2,2,2,1,5854,242,5 
+6871,permute_1136,call_function,permute.default,backward,2,1,1,1,5854,2,4 +6872,dtype_cast_487,call_function,dtype_cast.default,backward,2,1,1,1,5855,1,4 +6873,alias_default_1267,call_function,alias.default,backward,2,1,1,0,5856,0,3 +6874,alias_default_1208,call_function,alias.default,backward,2,1,1,2,5861,247,4 +6875,einsum_default_559,call_function,einsum.default,backward,2,2,2,1,5862,3,5 +6876,permute_1139,call_function,permute.default,backward,2,1,1,1,4,243,3 +6877,einsum_default_560,call_function,einsum.default,backward,2,2,2,1,5863,242,5 +6878,add_320,call_function,add.Tensor,unknown,,2,2,1,5870,241,10 +6879,permute_1140,call_function,permute.default,backward,2,1,1,1,5863,2,4 +6880,dtype_cast_488,call_function,dtype_cast.default,backward,2,1,1,1,5864,1,4 +6881,alias_default_1266,call_function,alias.default,backward,2,1,1,0,5865,0,3 +6882,alias_default_1209,call_function,alias.default,backward,2,1,1,2,5861,246,4 +6883,einsum_default_561,call_function,einsum.default,backward,2,2,2,1,5862,3,5 +6884,permute_1143,call_function,permute.default,backward,2,1,1,1,4,242,3 +6885,einsum_default_562,call_function,einsum.default,backward,2,2,2,1,5863,241,5 +6886,add_321,call_function,add.Tensor,unknown,,2,2,1,5886,240,10 +6887,permute_1144,call_function,permute.default,backward,2,1,1,1,5863,2,4 +6888,dtype_cast_489,call_function,dtype_cast.default,backward,2,1,1,1,5864,1,4 +6889,alias_default_1265,call_function,alias.default,backward,2,1,1,0,5865,0,3 +6890,convert_element_type_1850,call_function,convert_element_type.default,backward,2,1,1,1,5887,239,8 +6891,convert_element_type_1851,call_function,convert_element_type.default,backward,2,1,1,1,229,239,4 +6892,convert_element_type_1852,call_function,convert_element_type.default,backward,2,1,1,1,3,233,2 +6893,alias_default_1210,call_function,alias.default,backward,2,1,1,2,5888,238,4 +6894,mul_718,call_function,mul.Tensor,backward,2,2,2,1,5890,232,8 +6895,mul_719,call_function,mul.Tensor,backward,2,2,2,1,237,238,8 
+6896,alias_default_1211,call_function,alias.default,backward,2,1,1,2,5891,231,4 +6897,alias_default_1212,call_function,alias.default,backward,2,1,1,3,238,237,4 +6898,mul_720,call_function,mul.Tensor,backward,2,2,2,1,5895,230,8 +6899,sum_105,call_function,sum.dim_IntList,backward,2,1,1,1,5896,229,5 +6900,div_80,call_function,div.Tensor,backward,2,1,1,1,239,229,6 +6901,mul_721,call_function,mul.Tensor,backward,2,2,2,1,5898,228,8 +6902,sub_78,call_function,sub.Tensor,backward,2,2,2,1,5899,227,10 +6903,mul_722,call_function,mul.Tensor,backward,2,2,2,1,5900,226,8 +6904,mul_723,call_function,mul.Tensor,backward,2,2,2,1,5892,4,8 +6905,sum_106,call_function,sum.dim_IntList,backward,2,1,1,1,5893,3,5 +6906,convert_element_type_1853,call_function,convert_element_type.default,backward,2,1,1,1,5901,225,6 +6907,convert_element_type_1854,call_function,convert_element_type.default,backward,2,1,1,1,5894,2,3 +6908,add_322,call_function,add.Tensor,unknown,,2,2,1,5902,224,10 +6909,dtype_cast_490,call_function,dtype_cast.default,backward,2,1,1,1,5895,1,3 +6910,alias_default_1272,call_function,alias.default,backward,2,1,1,0,5896,0,2 +6911,alias_default_1213,call_function,alias.default,unknown,,1,1,3,5903,223,4 +6912,einsum_default_563,call_function,einsum.default,backward,1,2,2,1,5904,3,5 +6913,permute_1147,call_function,permute.default,backward,1,1,1,1,4,219,3 +6914,einsum_default_564,call_function,einsum.default,backward,1,2,2,1,5905,218,5 +6915,permute_1148,call_function,permute.default,backward,1,1,1,1,5905,2,4 +6916,dtype_cast_491,call_function,dtype_cast.default,backward,1,1,1,1,5906,1,4 +6917,alias_default_1261,call_function,alias.default,backward,1,1,1,0,5907,0,3 +6918,alias_default_1214,call_function,alias.default,backward,1,1,1,2,5906,217,4 +6919,mul_724,call_function,mul.Tensor,backward,1,2,2,1,5907,205,8 +6920,mul_725,call_function,mul.Tensor,backward,1,2,2,1,5907,209,8 +6921,alias_default_1215,call_function,alias.default,backward,1,1,1,2,5908,204,4 
+6922,einsum_default_565,call_function,einsum.default,backward,1,2,2,1,5909,3,5 +6923,permute_1151,call_function,permute.default,backward,1,1,1,1,4,200,3 +6924,einsum_default_566,call_function,einsum.default,backward,1,2,2,1,5910,199,5 +6925,permute_1152,call_function,permute.default,backward,1,1,1,1,5910,2,4 +6926,dtype_cast_492,call_function,dtype_cast.default,backward,1,1,1,1,5911,1,4 +6927,alias_default_1262,call_function,alias.default,backward,1,1,1,0,5912,0,3 +6928,convert_element_type_1863,call_function,convert_element_type.default,backward,1,1,1,1,5908,208,6 +6929,convert_element_type_1864,call_function,convert_element_type.default,backward,1,1,1,1,206,218,4 +6930,alias_default_1216,call_function,alias.default,backward,1,1,1,2,207,217,4 +6931,neg_54,call_function,neg.default,backward,1,1,1,1,208,216,8 +6932,exp_54,call_function,exp.default,backward,1,1,1,1,209,215,6 +6933,add_323,call_function,add.Tensor,backward,1,1,1,1,210,214,4 +6934,reciprocal_26,call_function,reciprocal.default,backward,1,1,1,1,211,213,4 +6935,mul_726,call_function,mul.Tensor,backward,1,1,1,1,212,212,6 +6936,alias_default_1217,call_function,alias.default,backward,1,1,1,2,213,211,4 +6937,mul_727,call_function,mul.Tensor,backward,1,2,2,1,5917,207,8 +6938,sub_79,call_function,sub.Tensor,backward,1,1,1,1,214,209,4 +6939,mul_728,call_function,mul.Tensor,backward,1,2,2,1,215,208,8 +6940,add_324,call_function,add.Tensor,backward,1,1,1,1,216,207,4 +6941,mul_729,call_function,mul.Tensor,backward,1,2,2,1,5921,206,8 +6942,convert_element_type_1865,call_function,convert_element_type.default,backward,1,1,1,1,5922,205,6 +6943,alias_default_1218,call_function,alias.default,backward,1,1,1,2,5923,204,4 +6944,einsum_default_567,call_function,einsum.default,backward,1,2,2,1,5924,3,5 +6945,permute_1155,call_function,permute.default,backward,1,1,1,1,4,200,3 +6946,einsum_default_568,call_function,einsum.default,backward,1,2,2,1,5925,199,5 +6947,add_325,call_function,add.Tensor,unknown,,2,2,1,5930,198,10 
+6948,permute_1156,call_function,permute.default,backward,1,1,1,1,5925,2,4 +6949,dtype_cast_493,call_function,dtype_cast.default,backward,1,1,1,1,5926,1,4 +6950,alias_default_1260,call_function,alias.default,backward,1,1,1,0,5927,0,3 +6951,convert_element_type_1870,call_function,convert_element_type.default,backward,1,1,1,1,5931,197,8 +6952,convert_element_type_1871,call_function,convert_element_type.default,backward,1,1,1,1,186,197,4 +6953,convert_element_type_1872,call_function,convert_element_type.default,backward,1,1,1,1,3,191,2 +6954,alias_default_1219,call_function,alias.default,backward,1,1,1,2,5932,196,4 +6955,mul_730,call_function,mul.Tensor,backward,1,2,2,1,5934,190,8 +6956,mul_731,call_function,mul.Tensor,backward,1,2,2,1,194,196,8 +6957,alias_default_1220,call_function,alias.default,backward,1,1,1,2,5935,189,4 +6958,alias_default_1221,call_function,alias.default,backward,1,1,1,3,195,195,4 +6959,mul_732,call_function,mul.Tensor,backward,1,2,2,1,5939,188,8 +6960,sum_107,call_function,sum.dim_IntList,backward,1,1,1,1,5940,187,5 +6961,div_81,call_function,div.Tensor,backward,1,1,1,1,196,187,6 +6962,mul_733,call_function,mul.Tensor,backward,1,2,2,1,5942,186,8 +6963,sub_80,call_function,sub.Tensor,backward,1,2,2,1,5943,185,10 +6964,mul_734,call_function,mul.Tensor,backward,1,2,2,1,5944,184,8 +6965,mul_735,call_function,mul.Tensor,backward,1,2,2,1,5936,4,8 +6966,sum_108,call_function,sum.dim_IntList,backward,1,1,1,1,5937,3,5 +6967,convert_element_type_1873,call_function,convert_element_type.default,backward,1,1,1,1,5945,183,6 +6968,convert_element_type_1874,call_function,convert_element_type.default,backward,1,1,1,1,5938,2,3 +6969,add_326,call_function,add.Tensor,unknown,,2,2,1,5946,182,10 +6970,dtype_cast_494,call_function,dtype_cast.default,backward,1,1,1,1,5939,1,3 +6971,alias_default_1264,call_function,alias.default,backward,1,1,1,0,5940,0,2 +6972,alias_default_1222,call_function,alias.default,unknown,,1,1,3,5947,181,4 
+6973,einsum_default_569,call_function,einsum.default,backward,1,2,2,1,5948,3,5 +6974,permute_1159,call_function,permute.default,backward,1,1,1,1,4,177,3 +6975,einsum_default_570,call_function,einsum.default,backward,1,2,2,1,5949,176,5 +6976,permute_1160,call_function,permute.default,backward,1,1,1,1,5949,2,4 +6977,dtype_cast_495,call_function,dtype_cast.default,backward,1,1,1,1,5950,1,4 +6978,alias_default_1259,call_function,alias.default,backward,1,1,1,0,5951,0,3 +6979,view_1228,call_function,view.default,backward,1,1,1,1,5950,175,4 +6980,permute_1161,call_function,permute.default,backward,1,1,1,1,5951,174,4 +6981,_scaled_dot_product_flash_attention_backward_26,call_function,_scaled_dot_product_flash_attention_backward.default,backward,1,8,8,3,5955,173,2 +6982,getitem_330,call_function,getitem,backward,1,1,1,1,5956,146,2 +6983,getitem_331,call_function,getitem,backward,1,1,1,1,5956,147,2 +6984,getitem_332,call_function,getitem,backward,1,1,1,1,5956,140,2 +6985,permute_1162,call_function,permute.default,backward,1,1,1,1,5957,139,2 +6986,permute_1163,call_function,permute.default,backward,1,1,1,1,5957,146,2 +6987,permute_1164,call_function,permute.default,backward,1,1,1,1,5957,145,2 +6988,convert_element_type_1879,call_function,convert_element_type.default,backward,1,1,1,1,5958,145,2 +6989,convert_element_type_1880,call_function,convert_element_type.default,backward,1,1,1,1,5958,144,2 +6990,view_1229,call_function,view.default,backward,1,1,1,1,5959,144,2 +6991,view_as_complex_108,call_function,view_as_complex.default,backward,1,1,1,1,5960,143,6 +6992,_conj_52,call_function,_conj.default,backward,1,1,1,1,4,144,3 +6993,clone_214,call_function,clone.default,backward,1,1,1,1,5,143,3 +6994,mul_736,call_function,mul.Tensor,backward,1,2,2,1,5963,142,8 +6995,view_1230,call_function,view.default,backward,1,1,1,1,5959,143,2 +6996,view_as_complex_109,call_function,view_as_complex.default,backward,1,1,1,1,5960,142,6 
+6997,_conj_53,call_function,_conj.default,backward,1,1,1,1,4,143,3 +6998,clone_215,call_function,clone.default,backward,1,1,1,1,5,142,3 +6999,mul_737,call_function,mul.Tensor,backward,1,2,2,1,5963,141,8 +7000,view_as_real_108,call_function,view_as_real.default,backward,1,1,1,1,5964,141,6 +7001,view_1231,call_function,view.default,backward,1,1,1,1,5965,140,6 +7002,convert_element_type_1881,call_function,convert_element_type.default,backward,1,1,1,1,5966,139,6 +7003,view_as_real_109,call_function,view_as_real.default,backward,1,1,1,1,5964,140,6 +7004,view_1232,call_function,view.default,backward,1,1,1,1,5965,139,6 +7005,convert_element_type_1882,call_function,convert_element_type.default,backward,1,1,1,1,5966,138,6 +7006,view_1233,call_function,view.default,backward,1,1,1,1,5958,138,2 +7007,view_1234,call_function,view.default,backward,1,1,1,1,5967,138,5 +7008,view_1235,call_function,view.default,backward,1,1,1,1,5967,137,5 +7009,alias_default_1223,call_function,alias.default,backward,1,1,1,2,5959,137,4 +7010,einsum_default_571,call_function,einsum.default,backward,1,2,2,1,5960,3,5 +7011,permute_1167,call_function,permute.default,backward,1,1,1,1,4,133,3 +7012,einsum_default_572,call_function,einsum.default,backward,1,2,2,1,5961,132,5 +7013,permute_1168,call_function,permute.default,backward,1,1,1,1,5961,2,4 +7014,dtype_cast_496,call_function,dtype_cast.default,backward,1,1,1,1,5962,1,4 +7015,alias_default_1258,call_function,alias.default,backward,1,1,1,0,5963,0,3 +7016,alias_default_1224,call_function,alias.default,backward,1,1,1,2,5968,137,4 +7017,einsum_default_573,call_function,einsum.default,backward,1,2,2,1,5969,3,5 +7018,permute_1171,call_function,permute.default,backward,1,1,1,1,4,133,3 +7019,einsum_default_574,call_function,einsum.default,backward,1,2,2,1,5970,132,5 +7020,add_327,call_function,add.Tensor,unknown,,2,2,1,5977,131,10 +7021,permute_1172,call_function,permute.default,backward,1,1,1,1,5970,2,4 
+7022,dtype_cast_497,call_function,dtype_cast.default,backward,1,1,1,1,5971,1,4 +7023,alias_default_1257,call_function,alias.default,backward,1,1,1,0,5972,0,3 +7024,alias_default_1225,call_function,alias.default,backward,1,1,1,2,5968,136,4 +7025,einsum_default_575,call_function,einsum.default,backward,1,2,2,1,5969,3,5 +7026,permute_1175,call_function,permute.default,backward,1,1,1,1,4,132,3 +7027,einsum_default_576,call_function,einsum.default,backward,1,2,2,1,5970,131,5 +7028,add_328,call_function,add.Tensor,unknown,,2,2,1,5993,130,10 +7029,permute_1176,call_function,permute.default,backward,1,1,1,1,5970,2,4 +7030,dtype_cast_498,call_function,dtype_cast.default,backward,1,1,1,1,5971,1,4 +7031,alias_default_1256,call_function,alias.default,backward,1,1,1,0,5972,0,3 +7032,convert_element_type_1895,call_function,convert_element_type.default,backward,1,1,1,1,5994,129,8 +7033,convert_element_type_1896,call_function,convert_element_type.default,backward,1,1,1,1,119,129,4 +7034,convert_element_type_1897,call_function,convert_element_type.default,backward,1,1,1,1,3,123,2 +7035,alias_default_1226,call_function,alias.default,backward,1,1,1,2,5995,128,4 +7036,mul_738,call_function,mul.Tensor,backward,1,2,2,1,5997,122,8 +7037,mul_739,call_function,mul.Tensor,backward,1,2,2,1,127,128,8 +7038,alias_default_1227,call_function,alias.default,backward,1,1,1,2,5998,121,4 +7039,alias_default_1228,call_function,alias.default,backward,1,1,1,3,128,127,4 +7040,mul_740,call_function,mul.Tensor,backward,1,2,2,1,6002,120,8 +7041,sum_109,call_function,sum.dim_IntList,backward,1,1,1,1,6003,119,5 +7042,div_82,call_function,div.Tensor,backward,1,1,1,1,129,119,6 +7043,mul_741,call_function,mul.Tensor,backward,1,2,2,1,6005,118,8 +7044,sub_81,call_function,sub.Tensor,backward,1,2,2,1,6006,117,10 +7045,mul_742,call_function,mul.Tensor,backward,1,2,2,1,6007,116,8 +7046,mul_743,call_function,mul.Tensor,backward,1,2,2,1,5999,4,8 +7047,sum_110,call_function,sum.dim_IntList,backward,1,1,1,1,6000,3,5 
+7048,convert_element_type_1898,call_function,convert_element_type.default,backward,1,1,1,1,6008,115,6 +7049,convert_element_type_1899,call_function,convert_element_type.default,backward,1,1,1,1,6001,2,3 +7050,add_329,call_function,add.Tensor,unknown,,2,2,1,6009,114,10 +7051,dtype_cast_499,call_function,dtype_cast.default,backward,1,1,1,1,6002,1,3 +7052,alias_default_1263,call_function,alias.default,backward,1,1,1,0,6003,0,2 +7053,alias_default_1229,call_function,alias.default,unknown,,1,1,3,6010,113,4 +7054,einsum_default_577,call_function,einsum.default,backward,0,2,2,1,6011,3,5 +7055,permute_1179,call_function,permute.default,backward,0,1,1,1,4,109,3 +7056,einsum_default_578,call_function,einsum.default,backward,0,2,2,1,6012,108,5 +7057,permute_1180,call_function,permute.default,backward,0,1,1,1,6012,2,4 +7058,dtype_cast_500,call_function,dtype_cast.default,backward,0,1,1,1,6013,1,4 +7059,alias_default_1252,call_function,alias.default,backward,0,1,1,0,6014,0,3 +7060,alias_default_1230,call_function,alias.default,backward,0,1,1,2,6013,107,4 +7061,mul_744,call_function,mul.Tensor,backward,0,2,2,1,6014,95,8 +7062,mul_745,call_function,mul.Tensor,backward,0,2,2,1,6014,99,8 +7063,alias_default_1231,call_function,alias.default,backward,0,1,1,2,6015,94,4 +7064,einsum_default_579,call_function,einsum.default,backward,0,2,2,1,6016,3,5 +7065,permute_1183,call_function,permute.default,backward,0,1,1,1,4,90,3 +7066,einsum_default_580,call_function,einsum.default,backward,0,2,2,1,6017,89,5 +7067,permute_1184,call_function,permute.default,backward,0,1,1,1,6017,2,4 +7068,dtype_cast_501,call_function,dtype_cast.default,backward,0,1,1,1,6018,1,4 +7069,alias_default_1253,call_function,alias.default,backward,0,1,1,0,6019,0,3 +7070,convert_element_type_1908,call_function,convert_element_type.default,backward,0,1,1,1,6015,98,6 +7071,convert_element_type_1909,call_function,convert_element_type.default,backward,0,1,1,1,96,108,4 
+7072,alias_default_1232,call_function,alias.default,backward,0,1,1,2,97,107,4 +7073,neg_55,call_function,neg.default,backward,0,1,1,1,98,106,8 +7074,exp_55,call_function,exp.default,backward,0,1,1,1,99,105,6 +7075,add_330,call_function,add.Tensor,backward,0,1,1,1,100,104,4 +7076,reciprocal_27,call_function,reciprocal.default,backward,0,1,1,1,101,103,4 +7077,mul_746,call_function,mul.Tensor,backward,0,1,1,1,102,102,6 +7078,alias_default_1233,call_function,alias.default,backward,0,1,1,2,103,101,4 +7079,mul_747,call_function,mul.Tensor,backward,0,2,2,1,6024,97,8 +7080,sub_82,call_function,sub.Tensor,backward,0,1,1,1,104,99,4 +7081,mul_748,call_function,mul.Tensor,backward,0,2,2,1,105,98,8 +7082,add_331,call_function,add.Tensor,backward,0,1,1,1,106,97,4 +7083,mul_749,call_function,mul.Tensor,backward,0,2,2,1,6028,96,8 +7084,convert_element_type_1910,call_function,convert_element_type.default,backward,0,1,1,1,6029,95,6 +7085,alias_default_1234,call_function,alias.default,backward,0,1,1,2,6030,94,4 +7086,einsum_default_581,call_function,einsum.default,backward,0,2,2,1,6031,3,5 +7087,permute_1187,call_function,permute.default,backward,0,1,1,1,4,90,3 +7088,einsum_default_582,call_function,einsum.default,backward,0,2,2,1,6032,89,5 +7089,add_332,call_function,add.Tensor,unknown,,2,2,1,6037,88,10 +7090,permute_1188,call_function,permute.default,backward,0,1,1,1,6032,2,4 +7091,dtype_cast_502,call_function,dtype_cast.default,backward,0,1,1,1,6033,1,4 +7092,alias_default_1251,call_function,alias.default,backward,0,1,1,0,6034,0,3 +7093,convert_element_type_1915,call_function,convert_element_type.default,backward,0,1,1,1,6038,87,8 +7094,convert_element_type_1916,call_function,convert_element_type.default,backward,0,1,1,1,76,87,4 +7095,convert_element_type_1917,call_function,convert_element_type.default,backward,0,1,1,1,3,81,2 +7096,alias_default_1235,call_function,alias.default,backward,0,1,1,2,6039,86,4 +7097,mul_750,call_function,mul.Tensor,backward,0,2,2,1,6041,80,8 
+7098,mul_751,call_function,mul.Tensor,backward,0,2,2,1,84,86,8 +7099,alias_default_1236,call_function,alias.default,backward,0,1,1,2,6042,79,4 +7100,alias_default_1237,call_function,alias.default,backward,0,1,1,3,85,85,4 +7101,mul_752,call_function,mul.Tensor,backward,0,2,2,1,6046,78,8 +7102,sum_111,call_function,sum.dim_IntList,backward,0,1,1,1,6047,77,5 +7103,div_83,call_function,div.Tensor,backward,0,1,1,1,86,77,6 +7104,mul_753,call_function,mul.Tensor,backward,0,2,2,1,6049,76,8 +7105,sub_83,call_function,sub.Tensor,backward,0,2,2,1,6050,75,10 +7106,mul_754,call_function,mul.Tensor,backward,0,2,2,1,6051,74,8 +7107,mul_755,call_function,mul.Tensor,backward,0,2,2,1,6043,4,8 +7108,sum_112,call_function,sum.dim_IntList,backward,0,1,1,1,6044,3,5 +7109,convert_element_type_1918,call_function,convert_element_type.default,backward,0,1,1,1,6052,73,6 +7110,convert_element_type_1919,call_function,convert_element_type.default,backward,0,1,1,1,6045,2,3 +7111,add_333,call_function,add.Tensor,unknown,,2,2,1,6053,72,10 +7112,dtype_cast_503,call_function,dtype_cast.default,backward,0,1,1,1,6046,1,3 +7113,alias_default_1255,call_function,alias.default,backward,0,1,1,0,6047,0,2 +7114,alias_default_1238,call_function,alias.default,unknown,,1,1,3,6054,71,4 +7115,einsum_default_583,call_function,einsum.default,backward,0,2,2,1,6055,3,5 +7116,permute_1191,call_function,permute.default,backward,0,1,1,1,4,67,3 +7117,einsum_default_584,call_function,einsum.default,backward,0,2,2,1,6056,66,5 +7118,permute_1192,call_function,permute.default,backward,0,1,1,1,6056,2,4 +7119,dtype_cast_504,call_function,dtype_cast.default,backward,0,1,1,1,6057,1,4 +7120,alias_default_1250,call_function,alias.default,backward,0,1,1,0,6058,0,3 +7121,view_1250,call_function,view.default,backward,0,1,1,1,6057,65,4 +7122,permute_1193,call_function,permute.default,backward,0,1,1,1,6058,64,4 
+7123,_scaled_dot_product_flash_attention_backward_27,call_function,_scaled_dot_product_flash_attention_backward.default,backward,0,8,8,3,6062,63,2 +7124,getitem_333,call_function,getitem,backward,0,1,1,1,6063,36,2 +7125,getitem_334,call_function,getitem,backward,0,1,1,1,6063,37,2 +7126,getitem_335,call_function,getitem,backward,0,1,1,1,6063,30,2 +7127,permute_1194,call_function,permute.default,backward,0,1,1,1,6064,29,2 +7128,permute_1195,call_function,permute.default,backward,0,1,1,1,6064,36,2 +7129,permute_1196,call_function,permute.default,backward,0,1,1,1,6064,35,2 +7130,convert_element_type_1924,call_function,convert_element_type.default,backward,0,1,1,1,6065,35,2 +7131,convert_element_type_1925,call_function,convert_element_type.default,backward,0,1,1,1,6065,34,2 +7132,view_1251,call_function,view.default,backward,0,1,1,1,6066,34,2 +7133,view_as_complex_110,call_function,view_as_complex.default,backward,0,1,1,1,6067,33,6 +7134,_conj_54,call_function,_conj.default,backward,0,1,1,1,4,34,3 +7135,clone_222,call_function,clone.default,backward,0,1,1,1,5,33,3 +7136,mul_756,call_function,mul.Tensor,backward,0,2,2,1,6070,32,8 +7137,view_1252,call_function,view.default,backward,0,1,1,1,6066,33,2 +7138,view_as_complex_111,call_function,view_as_complex.default,backward,0,1,1,1,6067,32,6 +7139,_conj_55,call_function,_conj.default,backward,0,1,1,1,4,33,3 +7140,clone_223,call_function,clone.default,backward,0,1,1,1,5,32,3 +7141,mul_757,call_function,mul.Tensor,backward,0,2,2,1,6070,31,8 +7142,view_as_real_110,call_function,view_as_real.default,backward,0,1,1,1,6071,31,6 +7143,view_1253,call_function,view.default,backward,0,1,1,1,6072,30,6 +7144,convert_element_type_1926,call_function,convert_element_type.default,backward,0,1,1,1,6073,29,6 +7145,view_as_real_111,call_function,view_as_real.default,backward,0,1,1,1,6071,30,6 +7146,view_1254,call_function,view.default,backward,0,1,1,1,6072,29,6 
+7147,convert_element_type_1927,call_function,convert_element_type.default,backward,0,1,1,1,6073,28,6 +7148,view_1255,call_function,view.default,backward,0,1,1,1,6065,28,2 +7149,view_1256,call_function,view.default,backward,0,1,1,1,6074,28,5 +7150,view_1257,call_function,view.default,backward,0,1,1,1,6074,27,5 +7151,alias_default_1239,call_function,alias.default,backward,0,1,1,2,6066,27,4 +7152,einsum_default_585,call_function,einsum.default,backward,0,2,2,1,6067,3,5 +7153,permute_1199,call_function,permute.default,backward,0,1,1,1,4,23,3 +7154,einsum_default_586,call_function,einsum.default,backward,0,2,2,1,6068,22,5 +7155,permute_1200,call_function,permute.default,backward,0,1,1,1,6068,2,4 +7156,dtype_cast_505,call_function,dtype_cast.default,backward,0,1,1,1,6069,1,4 +7157,alias_default_1249,call_function,alias.default,backward,0,1,1,0,6070,0,3 +7158,alias_default_1240,call_function,alias.default,backward,0,1,1,2,6075,27,4 +7159,einsum_default_587,call_function,einsum.default,backward,0,2,2,1,6076,3,5 +7160,permute_1203,call_function,permute.default,backward,0,1,1,1,4,23,3 +7161,einsum_default_588,call_function,einsum.default,backward,0,2,2,1,6077,22,5 +7162,add_334,call_function,add.Tensor,unknown,,2,2,1,6084,21,10 +7163,permute_1204,call_function,permute.default,backward,0,1,1,1,6077,2,4 +7164,dtype_cast_506,call_function,dtype_cast.default,backward,0,1,1,1,6078,1,4 +7165,alias_default_1248,call_function,alias.default,backward,0,1,1,0,6079,0,3 +7166,alias_default_1241,call_function,alias.default,backward,0,1,1,2,6075,26,4 +7167,einsum_default_589,call_function,einsum.default,backward,0,2,2,1,6076,3,5 +7168,permute_1207,call_function,permute.default,backward,0,1,1,1,4,22,3 +7169,einsum_default_590,call_function,einsum.default,backward,0,2,2,1,6077,21,5 +7170,add_335,call_function,add.Tensor,unknown,,2,2,1,6100,20,10 +7171,permute_1208,call_function,permute.default,backward,0,1,1,1,6077,2,4 
+7172,dtype_cast_507,call_function,dtype_cast.default,backward,0,1,1,1,6078,1,4 +7173,alias_default_1247,call_function,alias.default,backward,0,1,1,0,6079,0,3 +7174,convert_element_type_1940,call_function,convert_element_type.default,backward,0,1,1,1,6101,19,8 +7175,convert_element_type_1941,call_function,convert_element_type.default,backward,0,1,1,1,7,19,4 +7176,convert_element_type_1942,call_function,convert_element_type.default,backward,0,1,1,1,3,13,2 +7177,alias_default_1242,call_function,alias.default,backward,0,1,1,2,6102,18,4 +7178,mul_758,call_function,mul.Tensor,backward,0,2,2,1,6104,12,8 +7179,mul_759,call_function,mul.Tensor,backward,0,2,2,1,15,18,8 +7180,alias_default_1243,call_function,alias.default,backward,0,1,1,2,6105,11,4 +7181,alias_default_1244,call_function,alias.default,backward,0,1,1,3,16,17,4 +7182,mul_760,call_function,mul.Tensor,backward,0,2,2,1,6109,10,8 +7183,sum_113,call_function,sum.dim_IntList,backward,0,1,1,1,6110,9,5 +7184,div_84,call_function,div.Tensor,backward,0,1,1,1,17,9,6 +7185,mul_761,call_function,mul.Tensor,backward,0,2,2,1,6112,8,8 +7186,sub_84,call_function,sub.Tensor,backward,0,2,2,1,6113,7,10 +7187,mul_762,call_function,mul.Tensor,backward,0,2,2,1,6114,6,8 +7188,mul_763,call_function,mul.Tensor,backward,0,2,2,1,6106,4,8 +7189,sum_114,call_function,sum.dim_IntList,backward,0,1,1,1,6107,3,5 +7190,convert_element_type_1943,call_function,convert_element_type.default,backward,0,1,1,1,6115,5,6 +7191,convert_element_type_1944,call_function,convert_element_type.default,backward,0,1,1,1,6108,2,3 +7192,add_336,call_function,add.Tensor,unknown,,2,2,1,6116,4,10 +7193,dtype_cast_508,call_function,dtype_cast.default,backward,0,1,1,1,6109,1,3 +7194,alias_default_1254,call_function,alias.default,backward,0,1,1,0,6110,0,2 +7195,embedding_dense_backward,call_function,embedding_dense_backward.default,backward,,2,2,1,6117,3,5 +7196,dtype_cast_509,call_function,dtype_cast.default,backward,,1,1,1,6118,2,3 
+7197,add_337,call_function,add.Tensor,unknown,,2,2,1,6126,1,9 +7198,alias_default_1246,call_function,alias.default,unknown,,1,1,0,6127,0,3 diff --git a/profile_results/real_llama3_3b_dag_summary.json b/profile_results/real_llama3_3b_dag_summary.json new file mode 100644 index 00000000..93434ea9 --- /dev/null +++ b/profile_results/real_llama3_3b_dag_summary.json @@ -0,0 +1,883 @@ +{ + "branch_points": 1301, + "dag_edges": 8805, + "direct_dependency_histogram": { + "0": 257, + "1": 5275, + "2": 1611, + "3": 28, + "8": 28 + }, + "direct_offspring_histogram": { + "0": 255, + "1": 5643, + "2": 934, + "3": 254, + "4": 84, + "6": 28, + "28": 1 + }, + "ilp_nodes": 7199, + "max_ancestor_count": 6127, + "max_descendant_count": 5943, + "max_direct_dependency_nodes": 8, + "max_direct_offspring_nodes": 28, + "merge_points": 1667, + "merge_points_csv": "profile_results/real_llama3_3b_merge_points.csv", + "mesh": "1D 64", + "model": "LLaMA3 3B", + "node_stats_csv": "profile_results/real_llama3_3b_dag_node_stats.csv", + "top_fanout_points": [ + { + "ancestor_count": 1, + "descendant_count": 5942, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 28, + "idx": 296, + "layer": "", + "name": "alias_default_1", + "op": "call_function", + "phase": "unknown", + "strategy_count": 3, + "target": "alias.default" + }, + { + "ancestor_count": 20, + "descendant_count": 5788, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 276, + "layer": 0, + "name": "alias_default_8", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 132, + "descendant_count": 5692, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 381, + "layer": 1, + "name": "alias_default_36", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + 
"ancestor_count": 242, + "descendant_count": 5596, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 485, + "layer": 2, + "name": "alias_default_64", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 352, + "descendant_count": 5500, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 589, + "layer": 3, + "name": "alias_default_92", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 462, + "descendant_count": 5404, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 693, + "layer": 4, + "name": "alias_default_120", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 572, + "descendant_count": 5308, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 797, + "layer": 5, + "name": "alias_default_148", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 682, + "descendant_count": 5212, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 901, + "layer": 6, + "name": "alias_default_176", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 792, + "descendant_count": 5116, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 1005, + "layer": 7, + "name": "alias_default_204", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 902, + "descendant_count": 5020, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + 
"direct_offspring_nodes": 6, + "idx": 1109, + "layer": 8, + "name": "alias_default_232", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 1012, + "descendant_count": 4924, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 1213, + "layer": 9, + "name": "alias_default_260", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 1122, + "descendant_count": 4828, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 1317, + "layer": 10, + "name": "alias_default_288", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 1232, + "descendant_count": 4732, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 1421, + "layer": 11, + "name": "alias_default_316", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 1342, + "descendant_count": 4636, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 1525, + "layer": 12, + "name": "alias_default_344", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 1452, + "descendant_count": 4540, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 1629, + "layer": 13, + "name": "alias_default_372", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 1562, + "descendant_count": 4444, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 1733, + "layer": 14, + "name": "alias_default_400", + "op": 
"call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 1672, + "descendant_count": 4348, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 1837, + "layer": 15, + "name": "alias_default_428", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 1782, + "descendant_count": 4252, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 1941, + "layer": 16, + "name": "alias_default_456", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 1892, + "descendant_count": 4156, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 2045, + "layer": 17, + "name": "alias_default_484", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 2002, + "descendant_count": 4060, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 2149, + "layer": 18, + "name": "alias_default_512", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 2112, + "descendant_count": 3964, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 2253, + "layer": 19, + "name": "alias_default_540", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 2222, + "descendant_count": 3868, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 2357, + "layer": 20, + "name": "alias_default_568", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + 
"ancestor_count": 2332, + "descendant_count": 3772, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 2461, + "layer": 21, + "name": "alias_default_596", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 2442, + "descendant_count": 3676, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 2565, + "layer": 22, + "name": "alias_default_624", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 2552, + "descendant_count": 3580, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 2669, + "layer": 23, + "name": "alias_default_652", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 2662, + "descendant_count": 3484, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 2773, + "layer": 24, + "name": "alias_default_680", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 2772, + "descendant_count": 3388, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 2877, + "layer": 25, + "name": "alias_default_708", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 2882, + "descendant_count": 3292, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 2981, + "layer": 26, + "name": "alias_default_736", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 2992, + "descendant_count": 3196, + "direct_dependency_args": 1, + 
"direct_dependency_nodes": 1, + "direct_offspring_nodes": 6, + "idx": 3085, + "layer": 27, + "name": "alias_default_764", + "op": "call_function", + "phase": "forward", + "strategy_count": 4, + "target": "alias.default" + }, + { + "ancestor_count": 3, + "descendant_count": 5778, + "direct_dependency_args": 1, + "direct_dependency_nodes": 1, + "direct_offspring_nodes": 4, + "idx": 298, + "layer": 0, + "name": "alias_default_12", + "op": "call_function", + "phase": "forward", + "strategy_count": 3, + "target": "alias.default" + } + ], + "top_merge_points": [ + { + "ancestor_count": 3173, + "descendant_count": 3033, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 3289, + "layer": 27, + "name": "_scaled_dot_product_flash_attention_backward", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 3280, + "descendant_count": 2923, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 3431, + "layer": 26, + "name": "_scaled_dot_product_flash_attention_backward_1", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 3387, + "descendant_count": 2813, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 3573, + "layer": 25, + "name": "_scaled_dot_product_flash_attention_backward_2", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 3494, + "descendant_count": 2703, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 3715, + "layer": 24, + "name": "_scaled_dot_product_flash_attention_backward_3", + "op": "call_function", + "phase": 
"backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 3601, + "descendant_count": 2593, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 3857, + "layer": 23, + "name": "_scaled_dot_product_flash_attention_backward_4", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 3708, + "descendant_count": 2483, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 3999, + "layer": 22, + "name": "_scaled_dot_product_flash_attention_backward_5", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 3815, + "descendant_count": 2373, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 4141, + "layer": 21, + "name": "_scaled_dot_product_flash_attention_backward_6", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 3922, + "descendant_count": 2263, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 4283, + "layer": 20, + "name": "_scaled_dot_product_flash_attention_backward_7", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 4029, + "descendant_count": 2153, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 4425, + "layer": 19, + "name": "_scaled_dot_product_flash_attention_backward_8", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": 
"_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 4136, + "descendant_count": 2043, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 4567, + "layer": 18, + "name": "_scaled_dot_product_flash_attention_backward_9", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 4243, + "descendant_count": 1933, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 4709, + "layer": 17, + "name": "_scaled_dot_product_flash_attention_backward_10", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 4350, + "descendant_count": 1823, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 4851, + "layer": 16, + "name": "_scaled_dot_product_flash_attention_backward_11", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 4457, + "descendant_count": 1713, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 4993, + "layer": 15, + "name": "_scaled_dot_product_flash_attention_backward_12", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 4564, + "descendant_count": 1603, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 5135, + "layer": 14, + "name": "_scaled_dot_product_flash_attention_backward_13", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + 
"ancestor_count": 4671, + "descendant_count": 1493, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 5277, + "layer": 13, + "name": "_scaled_dot_product_flash_attention_backward_14", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 4778, + "descendant_count": 1383, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 5419, + "layer": 12, + "name": "_scaled_dot_product_flash_attention_backward_15", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 4885, + "descendant_count": 1273, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 5561, + "layer": 11, + "name": "_scaled_dot_product_flash_attention_backward_16", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 4992, + "descendant_count": 1163, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 5703, + "layer": 10, + "name": "_scaled_dot_product_flash_attention_backward_17", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 5099, + "descendant_count": 1053, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 5845, + "layer": 9, + "name": "_scaled_dot_product_flash_attention_backward_18", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 5206, + "descendant_count": 943, + 
"direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 5987, + "layer": 8, + "name": "_scaled_dot_product_flash_attention_backward_19", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 5313, + "descendant_count": 833, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 6129, + "layer": 7, + "name": "_scaled_dot_product_flash_attention_backward_20", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 5420, + "descendant_count": 723, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 6271, + "layer": 6, + "name": "_scaled_dot_product_flash_attention_backward_21", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 5527, + "descendant_count": 613, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 6413, + "layer": 5, + "name": "_scaled_dot_product_flash_attention_backward_22", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 5634, + "descendant_count": 503, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 6555, + "layer": 4, + "name": "_scaled_dot_product_flash_attention_backward_23", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 5741, + "descendant_count": 393, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + 
"direct_offspring_nodes": 3, + "idx": 6697, + "layer": 3, + "name": "_scaled_dot_product_flash_attention_backward_24", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 5848, + "descendant_count": 283, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 6839, + "layer": 2, + "name": "_scaled_dot_product_flash_attention_backward_25", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 5955, + "descendant_count": 173, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 6981, + "layer": 1, + "name": "_scaled_dot_product_flash_attention_backward_26", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 6062, + "descendant_count": 63, + "direct_dependency_args": 8, + "direct_dependency_nodes": 8, + "direct_offspring_nodes": 3, + "idx": 7123, + "layer": 0, + "name": "_scaled_dot_product_flash_attention_backward_27", + "op": "call_function", + "phase": "backward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention_backward.default" + }, + { + "ancestor_count": 63, + "descendant_count": 5761, + "direct_dependency_args": 3, + "direct_dependency_nodes": 3, + "direct_offspring_nodes": 4, + "idx": 313, + "layer": 0, + "name": "_scaled_dot_product_flash_attention", + "op": "call_function", + "phase": "forward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention.default" + }, + { + "ancestor_count": 173, + "descendant_count": 5665, + "direct_dependency_args": 3, + "direct_dependency_nodes": 3, + "direct_offspring_nodes": 4, + "idx": 417, + "layer": 1, + "name": 
"_scaled_dot_product_flash_attention_1", + "op": "call_function", + "phase": "forward", + "strategy_count": 2, + "target": "_scaled_dot_product_flash_attention.default" + } + ], + "trace_and_optimizer_build_s": 38.44014171184972, + "treewidth_upper_bounds": { + "moralized_edges": 11200, + "moralized_min_degree": 10, + "moralized_min_fill": 8, + "undirected_edges": 8805, + "undirected_min_degree": 9, + "undirected_min_fill": 6 + } +} \ No newline at end of file diff --git a/profile_results/real_llama3_3b_merge_points.csv b/profile_results/real_llama3_3b_merge_points.csv new file mode 100644 index 00000000..4418765e --- /dev/null +++ b/profile_results/real_llama3_3b_merge_points.csv @@ -0,0 +1,1668 @@ +idx,name,op,target,phase,layer,direct_dependency_args,direct_dependency_nodes,direct_offspring_nodes,ancestor_count,descendant_count,strategy_count +3289,_scaled_dot_product_flash_attention_backward,call_function,_scaled_dot_product_flash_attention_backward.default,backward,27,8,8,3,3173,3033,2 +3431,_scaled_dot_product_flash_attention_backward_1,call_function,_scaled_dot_product_flash_attention_backward.default,backward,26,8,8,3,3280,2923,2 +3573,_scaled_dot_product_flash_attention_backward_2,call_function,_scaled_dot_product_flash_attention_backward.default,backward,25,8,8,3,3387,2813,2 +3715,_scaled_dot_product_flash_attention_backward_3,call_function,_scaled_dot_product_flash_attention_backward.default,backward,24,8,8,3,3494,2703,2 +3857,_scaled_dot_product_flash_attention_backward_4,call_function,_scaled_dot_product_flash_attention_backward.default,backward,23,8,8,3,3601,2593,2 +3999,_scaled_dot_product_flash_attention_backward_5,call_function,_scaled_dot_product_flash_attention_backward.default,backward,22,8,8,3,3708,2483,2 +4141,_scaled_dot_product_flash_attention_backward_6,call_function,_scaled_dot_product_flash_attention_backward.default,backward,21,8,8,3,3815,2373,2 
+4283,_scaled_dot_product_flash_attention_backward_7,call_function,_scaled_dot_product_flash_attention_backward.default,backward,20,8,8,3,3922,2263,2 +4425,_scaled_dot_product_flash_attention_backward_8,call_function,_scaled_dot_product_flash_attention_backward.default,backward,19,8,8,3,4029,2153,2 +4567,_scaled_dot_product_flash_attention_backward_9,call_function,_scaled_dot_product_flash_attention_backward.default,backward,18,8,8,3,4136,2043,2 +4709,_scaled_dot_product_flash_attention_backward_10,call_function,_scaled_dot_product_flash_attention_backward.default,backward,17,8,8,3,4243,1933,2 +4851,_scaled_dot_product_flash_attention_backward_11,call_function,_scaled_dot_product_flash_attention_backward.default,backward,16,8,8,3,4350,1823,2 +4993,_scaled_dot_product_flash_attention_backward_12,call_function,_scaled_dot_product_flash_attention_backward.default,backward,15,8,8,3,4457,1713,2 +5135,_scaled_dot_product_flash_attention_backward_13,call_function,_scaled_dot_product_flash_attention_backward.default,backward,14,8,8,3,4564,1603,2 +5277,_scaled_dot_product_flash_attention_backward_14,call_function,_scaled_dot_product_flash_attention_backward.default,backward,13,8,8,3,4671,1493,2 +5419,_scaled_dot_product_flash_attention_backward_15,call_function,_scaled_dot_product_flash_attention_backward.default,backward,12,8,8,3,4778,1383,2 +5561,_scaled_dot_product_flash_attention_backward_16,call_function,_scaled_dot_product_flash_attention_backward.default,backward,11,8,8,3,4885,1273,2 +5703,_scaled_dot_product_flash_attention_backward_17,call_function,_scaled_dot_product_flash_attention_backward.default,backward,10,8,8,3,4992,1163,2 +5845,_scaled_dot_product_flash_attention_backward_18,call_function,_scaled_dot_product_flash_attention_backward.default,backward,9,8,8,3,5099,1053,2 +5987,_scaled_dot_product_flash_attention_backward_19,call_function,_scaled_dot_product_flash_attention_backward.default,backward,8,8,8,3,5206,943,2 
+6129,_scaled_dot_product_flash_attention_backward_20,call_function,_scaled_dot_product_flash_attention_backward.default,backward,7,8,8,3,5313,833,2 +6271,_scaled_dot_product_flash_attention_backward_21,call_function,_scaled_dot_product_flash_attention_backward.default,backward,6,8,8,3,5420,723,2 +6413,_scaled_dot_product_flash_attention_backward_22,call_function,_scaled_dot_product_flash_attention_backward.default,backward,5,8,8,3,5527,613,2 +6555,_scaled_dot_product_flash_attention_backward_23,call_function,_scaled_dot_product_flash_attention_backward.default,backward,4,8,8,3,5634,503,2 +6697,_scaled_dot_product_flash_attention_backward_24,call_function,_scaled_dot_product_flash_attention_backward.default,backward,3,8,8,3,5741,393,2 +6839,_scaled_dot_product_flash_attention_backward_25,call_function,_scaled_dot_product_flash_attention_backward.default,backward,2,8,8,3,5848,283,2 +6981,_scaled_dot_product_flash_attention_backward_26,call_function,_scaled_dot_product_flash_attention_backward.default,backward,1,8,8,3,5955,173,2 +7123,_scaled_dot_product_flash_attention_backward_27,call_function,_scaled_dot_product_flash_attention_backward.default,backward,0,8,8,3,6062,63,2 +313,_scaled_dot_product_flash_attention,call_function,_scaled_dot_product_flash_attention.default,forward,0,3,3,4,63,5761,2 +417,_scaled_dot_product_flash_attention_1,call_function,_scaled_dot_product_flash_attention.default,forward,1,3,3,4,173,5665,2 +521,_scaled_dot_product_flash_attention_2,call_function,_scaled_dot_product_flash_attention.default,forward,2,3,3,4,283,5569,2 +625,_scaled_dot_product_flash_attention_3,call_function,_scaled_dot_product_flash_attention.default,forward,3,3,3,4,393,5473,2 +729,_scaled_dot_product_flash_attention_4,call_function,_scaled_dot_product_flash_attention.default,forward,4,3,3,4,503,5377,2 +833,_scaled_dot_product_flash_attention_5,call_function,_scaled_dot_product_flash_attention.default,forward,5,3,3,4,613,5281,2 
+937,_scaled_dot_product_flash_attention_6,call_function,_scaled_dot_product_flash_attention.default,forward,6,3,3,4,723,5185,2 +1041,_scaled_dot_product_flash_attention_7,call_function,_scaled_dot_product_flash_attention.default,forward,7,3,3,4,833,5089,2 +1145,_scaled_dot_product_flash_attention_8,call_function,_scaled_dot_product_flash_attention.default,forward,8,3,3,4,943,4993,2 +1249,_scaled_dot_product_flash_attention_9,call_function,_scaled_dot_product_flash_attention.default,forward,9,3,3,4,1053,4897,2 +1353,_scaled_dot_product_flash_attention_10,call_function,_scaled_dot_product_flash_attention.default,forward,10,3,3,4,1163,4801,2 +1457,_scaled_dot_product_flash_attention_11,call_function,_scaled_dot_product_flash_attention.default,forward,11,3,3,4,1273,4705,2 +1561,_scaled_dot_product_flash_attention_12,call_function,_scaled_dot_product_flash_attention.default,forward,12,3,3,4,1383,4609,2 +1665,_scaled_dot_product_flash_attention_13,call_function,_scaled_dot_product_flash_attention.default,forward,13,3,3,4,1493,4513,2 +1769,_scaled_dot_product_flash_attention_14,call_function,_scaled_dot_product_flash_attention.default,forward,14,3,3,4,1603,4417,2 +1873,_scaled_dot_product_flash_attention_15,call_function,_scaled_dot_product_flash_attention.default,forward,15,3,3,4,1713,4321,2 +1977,_scaled_dot_product_flash_attention_16,call_function,_scaled_dot_product_flash_attention.default,forward,16,3,3,4,1823,4225,2 +2081,_scaled_dot_product_flash_attention_17,call_function,_scaled_dot_product_flash_attention.default,forward,17,3,3,4,1933,4129,2 +2185,_scaled_dot_product_flash_attention_18,call_function,_scaled_dot_product_flash_attention.default,forward,18,3,3,4,2043,4033,2 +2289,_scaled_dot_product_flash_attention_19,call_function,_scaled_dot_product_flash_attention.default,forward,19,3,3,4,2153,3937,2 +2393,_scaled_dot_product_flash_attention_20,call_function,_scaled_dot_product_flash_attention.default,forward,20,3,3,4,2263,3841,2 
+2497,_scaled_dot_product_flash_attention_21,call_function,_scaled_dot_product_flash_attention.default,forward,21,3,3,4,2373,3745,2 +2601,_scaled_dot_product_flash_attention_22,call_function,_scaled_dot_product_flash_attention.default,forward,22,3,3,4,2483,3649,2 +2705,_scaled_dot_product_flash_attention_23,call_function,_scaled_dot_product_flash_attention.default,forward,23,3,3,4,2593,3553,2 +2809,_scaled_dot_product_flash_attention_24,call_function,_scaled_dot_product_flash_attention.default,forward,24,3,3,4,2703,3457,2 +2913,_scaled_dot_product_flash_attention_25,call_function,_scaled_dot_product_flash_attention.default,forward,25,3,3,4,2813,3361,2 +3017,_scaled_dot_product_flash_attention_26,call_function,_scaled_dot_product_flash_attention.default,forward,26,3,3,4,2923,3265,2 +3121,_scaled_dot_product_flash_attention_27,call_function,_scaled_dot_product_flash_attention.default,forward,27,3,3,4,3033,3169,2 +260,embedding,call_function,embedding.default,forward,,2,2,1,5,5804,5 +270,mul,call_function,mul.Tensor,forward,0,2,2,1,14,5791,8 +272,mul_1,call_function,mul.Tensor,forward,0,2,2,1,18,5790,8 +278,einsum_default,call_function,einsum.default,forward,0,2,2,1,25,5772,5 +282,einsum_default_1,call_function,einsum.default,forward,0,2,2,1,25,5772,5 +299,mul_2,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8 +302,mul_3,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8 +286,einsum_default_2,call_function,einsum.default,forward,0,2,2,1,25,5765,5 +325,einsum_default_3,call_function,einsum.default,forward,0,2,2,1,73,5752,5 +326,add_1,call_function,add.Tensor,forward,0,2,2,1,74,5751,10 +336,mul_4,call_function,mul.Tensor,forward,0,2,2,1,83,5738,8 +338,mul_5,call_function,mul.Tensor,forward,0,2,2,1,87,5737,8 +344,einsum_default_4,call_function,einsum.default,forward,0,2,2,1,94,5732,5 +351,div,call_function,div.Tensor,forward,0,2,2,1,101,5714,6 +356,einsum_default_5,call_function,einsum.default,forward,0,2,2,1,94,5713,5 
+359,mul_6,call_function,mul.Tensor,forward,0,2,2,1,110,5711,8 +364,einsum_default_6,call_function,einsum.default,forward,0,2,2,1,116,5709,5 +365,add_4,call_function,add.Tensor,forward,0,2,2,1,117,5708,10 +375,mul_7,call_function,mul.Tensor,forward,1,2,2,1,126,5695,8 +377,mul_8,call_function,mul.Tensor,forward,1,2,2,1,130,5694,8 +383,einsum_default_7,call_function,einsum.default,forward,1,2,2,1,137,5676,5 +387,einsum_default_8,call_function,einsum.default,forward,1,2,2,1,137,5676,5 +403,mul_9,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8 +406,mul_10,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8 +391,einsum_default_9,call_function,einsum.default,forward,1,2,2,1,137,5669,5 +429,einsum_default_10,call_function,einsum.default,forward,1,2,2,1,183,5656,5 +430,add_6,call_function,add.Tensor,forward,1,2,2,1,184,5655,10 +440,mul_11,call_function,mul.Tensor,forward,1,2,2,1,193,5642,8 +442,mul_12,call_function,mul.Tensor,forward,1,2,2,1,197,5641,8 +448,einsum_default_11,call_function,einsum.default,forward,1,2,2,1,204,5636,5 +455,div_1,call_function,div.Tensor,forward,1,2,2,1,211,5618,6 +460,einsum_default_12,call_function,einsum.default,forward,1,2,2,1,204,5617,5 +463,mul_13,call_function,mul.Tensor,forward,1,2,2,1,220,5615,8 +468,einsum_default_13,call_function,einsum.default,forward,1,2,2,1,226,5613,5 +469,add_9,call_function,add.Tensor,forward,1,2,2,1,227,5612,10 +479,mul_14,call_function,mul.Tensor,forward,2,2,2,1,236,5599,8 +481,mul_15,call_function,mul.Tensor,forward,2,2,2,1,240,5598,8 +487,einsum_default_14,call_function,einsum.default,forward,2,2,2,1,247,5580,5 +491,einsum_default_15,call_function,einsum.default,forward,2,2,2,1,247,5580,5 +507,mul_16,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8 +510,mul_17,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8 +495,einsum_default_16,call_function,einsum.default,forward,2,2,2,1,247,5573,5 +533,einsum_default_17,call_function,einsum.default,forward,2,2,2,1,293,5560,5 
+534,add_11,call_function,add.Tensor,forward,2,2,2,1,294,5559,10 +544,mul_18,call_function,mul.Tensor,forward,2,2,2,1,303,5546,8 +546,mul_19,call_function,mul.Tensor,forward,2,2,2,1,307,5545,8 +552,einsum_default_18,call_function,einsum.default,forward,2,2,2,1,314,5540,5 +559,div_2,call_function,div.Tensor,forward,2,2,2,1,321,5522,6 +564,einsum_default_19,call_function,einsum.default,forward,2,2,2,1,314,5521,5 +567,mul_20,call_function,mul.Tensor,forward,2,2,2,1,330,5519,8 +572,einsum_default_20,call_function,einsum.default,forward,2,2,2,1,336,5517,5 +573,add_14,call_function,add.Tensor,forward,2,2,2,1,337,5516,10 +583,mul_21,call_function,mul.Tensor,forward,3,2,2,1,346,5503,8 +585,mul_22,call_function,mul.Tensor,forward,3,2,2,1,350,5502,8 +591,einsum_default_21,call_function,einsum.default,forward,3,2,2,1,357,5484,5 +595,einsum_default_22,call_function,einsum.default,forward,3,2,2,1,357,5484,5 +611,mul_23,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8 +614,mul_24,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8 +599,einsum_default_23,call_function,einsum.default,forward,3,2,2,1,357,5477,5 +637,einsum_default_24,call_function,einsum.default,forward,3,2,2,1,403,5464,5 +638,add_16,call_function,add.Tensor,forward,3,2,2,1,404,5463,10 +648,mul_25,call_function,mul.Tensor,forward,3,2,2,1,413,5450,8 +650,mul_26,call_function,mul.Tensor,forward,3,2,2,1,417,5449,8 +656,einsum_default_25,call_function,einsum.default,forward,3,2,2,1,424,5444,5 +663,div_3,call_function,div.Tensor,forward,3,2,2,1,431,5426,6 +668,einsum_default_26,call_function,einsum.default,forward,3,2,2,1,424,5425,5 +671,mul_27,call_function,mul.Tensor,forward,3,2,2,1,440,5423,8 +676,einsum_default_27,call_function,einsum.default,forward,3,2,2,1,446,5421,5 +677,add_19,call_function,add.Tensor,forward,3,2,2,1,447,5420,10 +687,mul_28,call_function,mul.Tensor,forward,4,2,2,1,456,5407,8 +689,mul_29,call_function,mul.Tensor,forward,4,2,2,1,460,5406,8 
+695,einsum_default_28,call_function,einsum.default,forward,4,2,2,1,467,5388,5 +699,einsum_default_29,call_function,einsum.default,forward,4,2,2,1,467,5388,5 +715,mul_30,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8 +718,mul_31,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8 +703,einsum_default_30,call_function,einsum.default,forward,4,2,2,1,467,5381,5 +741,einsum_default_31,call_function,einsum.default,forward,4,2,2,1,513,5368,5 +742,add_21,call_function,add.Tensor,forward,4,2,2,1,514,5367,10 +752,mul_32,call_function,mul.Tensor,forward,4,2,2,1,523,5354,8 +754,mul_33,call_function,mul.Tensor,forward,4,2,2,1,527,5353,8 +760,einsum_default_32,call_function,einsum.default,forward,4,2,2,1,534,5348,5 +767,div_4,call_function,div.Tensor,forward,4,2,2,1,541,5330,6 +772,einsum_default_33,call_function,einsum.default,forward,4,2,2,1,534,5329,5 +775,mul_34,call_function,mul.Tensor,forward,4,2,2,1,550,5327,8 +780,einsum_default_34,call_function,einsum.default,forward,4,2,2,1,556,5325,5 +781,add_24,call_function,add.Tensor,forward,4,2,2,1,557,5324,10 +791,mul_35,call_function,mul.Tensor,forward,5,2,2,1,566,5311,8 +793,mul_36,call_function,mul.Tensor,forward,5,2,2,1,570,5310,8 +799,einsum_default_35,call_function,einsum.default,forward,5,2,2,1,577,5292,5 +803,einsum_default_36,call_function,einsum.default,forward,5,2,2,1,577,5292,5 +819,mul_37,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8 +822,mul_38,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8 +807,einsum_default_37,call_function,einsum.default,forward,5,2,2,1,577,5285,5 +845,einsum_default_38,call_function,einsum.default,forward,5,2,2,1,623,5272,5 +846,add_26,call_function,add.Tensor,forward,5,2,2,1,624,5271,10 +856,mul_39,call_function,mul.Tensor,forward,5,2,2,1,633,5258,8 +858,mul_40,call_function,mul.Tensor,forward,5,2,2,1,637,5257,8 +864,einsum_default_39,call_function,einsum.default,forward,5,2,2,1,644,5252,5 +871,div_5,call_function,div.Tensor,forward,5,2,2,1,651,5234,6 
+876,einsum_default_40,call_function,einsum.default,forward,5,2,2,1,644,5233,5 +879,mul_41,call_function,mul.Tensor,forward,5,2,2,1,660,5231,8 +884,einsum_default_41,call_function,einsum.default,forward,5,2,2,1,666,5229,5 +885,add_29,call_function,add.Tensor,forward,5,2,2,1,667,5228,10 +895,mul_42,call_function,mul.Tensor,forward,6,2,2,1,676,5215,8 +897,mul_43,call_function,mul.Tensor,forward,6,2,2,1,680,5214,8 +903,einsum_default_42,call_function,einsum.default,forward,6,2,2,1,687,5196,5 +907,einsum_default_43,call_function,einsum.default,forward,6,2,2,1,687,5196,5 +923,mul_44,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8 +926,mul_45,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8 +911,einsum_default_44,call_function,einsum.default,forward,6,2,2,1,687,5189,5 +949,einsum_default_45,call_function,einsum.default,forward,6,2,2,1,733,5176,5 +950,add_31,call_function,add.Tensor,forward,6,2,2,1,734,5175,10 +960,mul_46,call_function,mul.Tensor,forward,6,2,2,1,743,5162,8 +962,mul_47,call_function,mul.Tensor,forward,6,2,2,1,747,5161,8 +968,einsum_default_46,call_function,einsum.default,forward,6,2,2,1,754,5156,5 +975,div_6,call_function,div.Tensor,forward,6,2,2,1,761,5138,6 +980,einsum_default_47,call_function,einsum.default,forward,6,2,2,1,754,5137,5 +983,mul_48,call_function,mul.Tensor,forward,6,2,2,1,770,5135,8 +988,einsum_default_48,call_function,einsum.default,forward,6,2,2,1,776,5133,5 +989,add_34,call_function,add.Tensor,forward,6,2,2,1,777,5132,10 +999,mul_49,call_function,mul.Tensor,forward,7,2,2,1,786,5119,8 +1001,mul_50,call_function,mul.Tensor,forward,7,2,2,1,790,5118,8 +1007,einsum_default_49,call_function,einsum.default,forward,7,2,2,1,797,5100,5 +1011,einsum_default_50,call_function,einsum.default,forward,7,2,2,1,797,5100,5 +1027,mul_51,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8 +1030,mul_52,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8 +1015,einsum_default_51,call_function,einsum.default,forward,7,2,2,1,797,5093,5 
+1053,einsum_default_52,call_function,einsum.default,forward,7,2,2,1,843,5080,5 +1054,add_36,call_function,add.Tensor,forward,7,2,2,1,844,5079,10 +1064,mul_53,call_function,mul.Tensor,forward,7,2,2,1,853,5066,8 +1066,mul_54,call_function,mul.Tensor,forward,7,2,2,1,857,5065,8 +1072,einsum_default_53,call_function,einsum.default,forward,7,2,2,1,864,5060,5 +1079,div_7,call_function,div.Tensor,forward,7,2,2,1,871,5042,6 +1084,einsum_default_54,call_function,einsum.default,forward,7,2,2,1,864,5041,5 +1087,mul_55,call_function,mul.Tensor,forward,7,2,2,1,880,5039,8 +1092,einsum_default_55,call_function,einsum.default,forward,7,2,2,1,886,5037,5 +1093,add_39,call_function,add.Tensor,forward,7,2,2,1,887,5036,10 +1103,mul_56,call_function,mul.Tensor,forward,8,2,2,1,896,5023,8 +1105,mul_57,call_function,mul.Tensor,forward,8,2,2,1,900,5022,8 +1111,einsum_default_56,call_function,einsum.default,forward,8,2,2,1,907,5004,5 +1115,einsum_default_57,call_function,einsum.default,forward,8,2,2,1,907,5004,5 +1131,mul_58,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8 +1134,mul_59,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8 +1119,einsum_default_58,call_function,einsum.default,forward,8,2,2,1,907,4997,5 +1157,einsum_default_59,call_function,einsum.default,forward,8,2,2,1,953,4984,5 +1158,add_41,call_function,add.Tensor,forward,8,2,2,1,954,4983,10 +1168,mul_60,call_function,mul.Tensor,forward,8,2,2,1,963,4970,8 +1170,mul_61,call_function,mul.Tensor,forward,8,2,2,1,967,4969,8 +1176,einsum_default_60,call_function,einsum.default,forward,8,2,2,1,974,4964,5 +1183,div_8,call_function,div.Tensor,forward,8,2,2,1,981,4946,6 +1188,einsum_default_61,call_function,einsum.default,forward,8,2,2,1,974,4945,5 +1191,mul_62,call_function,mul.Tensor,forward,8,2,2,1,990,4943,8 +1196,einsum_default_62,call_function,einsum.default,forward,8,2,2,1,996,4941,5 +1197,add_44,call_function,add.Tensor,forward,8,2,2,1,997,4940,10 +1207,mul_63,call_function,mul.Tensor,forward,9,2,2,1,1006,4927,8 
+1209,mul_64,call_function,mul.Tensor,forward,9,2,2,1,1010,4926,8 +1215,einsum_default_63,call_function,einsum.default,forward,9,2,2,1,1017,4908,5 +1219,einsum_default_64,call_function,einsum.default,forward,9,2,2,1,1017,4908,5 +1235,mul_65,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8 +1238,mul_66,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8 +1223,einsum_default_65,call_function,einsum.default,forward,9,2,2,1,1017,4901,5 +1261,einsum_default_66,call_function,einsum.default,forward,9,2,2,1,1063,4888,5 +1262,add_46,call_function,add.Tensor,forward,9,2,2,1,1064,4887,10 +1272,mul_67,call_function,mul.Tensor,forward,9,2,2,1,1073,4874,8 +1274,mul_68,call_function,mul.Tensor,forward,9,2,2,1,1077,4873,8 +1280,einsum_default_67,call_function,einsum.default,forward,9,2,2,1,1084,4868,5 +1287,div_9,call_function,div.Tensor,forward,9,2,2,1,1091,4850,6 +1292,einsum_default_68,call_function,einsum.default,forward,9,2,2,1,1084,4849,5 +1295,mul_69,call_function,mul.Tensor,forward,9,2,2,1,1100,4847,8 +1300,einsum_default_69,call_function,einsum.default,forward,9,2,2,1,1106,4845,5 +1301,add_49,call_function,add.Tensor,forward,9,2,2,1,1107,4844,10 +1311,mul_70,call_function,mul.Tensor,forward,10,2,2,1,1116,4831,8 +1313,mul_71,call_function,mul.Tensor,forward,10,2,2,1,1120,4830,8 +1319,einsum_default_70,call_function,einsum.default,forward,10,2,2,1,1127,4812,5 +1323,einsum_default_71,call_function,einsum.default,forward,10,2,2,1,1127,4812,5 +1339,mul_72,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8 +1342,mul_73,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8 +1327,einsum_default_72,call_function,einsum.default,forward,10,2,2,1,1127,4805,5 +1365,einsum_default_73,call_function,einsum.default,forward,10,2,2,1,1173,4792,5 +1366,add_51,call_function,add.Tensor,forward,10,2,2,1,1174,4791,10 +1376,mul_74,call_function,mul.Tensor,forward,10,2,2,1,1183,4778,8 +1378,mul_75,call_function,mul.Tensor,forward,10,2,2,1,1187,4777,8 
+1384,einsum_default_74,call_function,einsum.default,forward,10,2,2,1,1194,4772,5 +1391,div_10,call_function,div.Tensor,forward,10,2,2,1,1201,4754,6 +1396,einsum_default_75,call_function,einsum.default,forward,10,2,2,1,1194,4753,5 +1399,mul_76,call_function,mul.Tensor,forward,10,2,2,1,1210,4751,8 +1404,einsum_default_76,call_function,einsum.default,forward,10,2,2,1,1216,4749,5 +1405,add_54,call_function,add.Tensor,forward,10,2,2,1,1217,4748,10 +1415,mul_77,call_function,mul.Tensor,forward,11,2,2,1,1226,4735,8 +1417,mul_78,call_function,mul.Tensor,forward,11,2,2,1,1230,4734,8 +1423,einsum_default_77,call_function,einsum.default,forward,11,2,2,1,1237,4716,5 +1427,einsum_default_78,call_function,einsum.default,forward,11,2,2,1,1237,4716,5 +1443,mul_79,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8 +1446,mul_80,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8 +1431,einsum_default_79,call_function,einsum.default,forward,11,2,2,1,1237,4709,5 +1469,einsum_default_80,call_function,einsum.default,forward,11,2,2,1,1283,4696,5 +1470,add_56,call_function,add.Tensor,forward,11,2,2,1,1284,4695,10 +1480,mul_81,call_function,mul.Tensor,forward,11,2,2,1,1293,4682,8 +1482,mul_82,call_function,mul.Tensor,forward,11,2,2,1,1297,4681,8 +1488,einsum_default_81,call_function,einsum.default,forward,11,2,2,1,1304,4676,5 +1495,div_11,call_function,div.Tensor,forward,11,2,2,1,1311,4658,6 +1500,einsum_default_82,call_function,einsum.default,forward,11,2,2,1,1304,4657,5 +1503,mul_83,call_function,mul.Tensor,forward,11,2,2,1,1320,4655,8 +1508,einsum_default_83,call_function,einsum.default,forward,11,2,2,1,1326,4653,5 +1509,add_59,call_function,add.Tensor,forward,11,2,2,1,1327,4652,10 +1519,mul_84,call_function,mul.Tensor,forward,12,2,2,1,1336,4639,8 +1521,mul_85,call_function,mul.Tensor,forward,12,2,2,1,1340,4638,8 +1527,einsum_default_84,call_function,einsum.default,forward,12,2,2,1,1347,4620,5 +1531,einsum_default_85,call_function,einsum.default,forward,12,2,2,1,1347,4620,5 
+1547,mul_86,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8 +1550,mul_87,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8 +1535,einsum_default_86,call_function,einsum.default,forward,12,2,2,1,1347,4613,5 +1573,einsum_default_87,call_function,einsum.default,forward,12,2,2,1,1393,4600,5 +1574,add_61,call_function,add.Tensor,forward,12,2,2,1,1394,4599,10 +1584,mul_88,call_function,mul.Tensor,forward,12,2,2,1,1403,4586,8 +1586,mul_89,call_function,mul.Tensor,forward,12,2,2,1,1407,4585,8 +1592,einsum_default_88,call_function,einsum.default,forward,12,2,2,1,1414,4580,5 +1599,div_12,call_function,div.Tensor,forward,12,2,2,1,1421,4562,6 +1604,einsum_default_89,call_function,einsum.default,forward,12,2,2,1,1414,4561,5 +1607,mul_90,call_function,mul.Tensor,forward,12,2,2,1,1430,4559,8 +1612,einsum_default_90,call_function,einsum.default,forward,12,2,2,1,1436,4557,5 +1613,add_64,call_function,add.Tensor,forward,12,2,2,1,1437,4556,10 +1623,mul_91,call_function,mul.Tensor,forward,13,2,2,1,1446,4543,8 +1625,mul_92,call_function,mul.Tensor,forward,13,2,2,1,1450,4542,8 +1631,einsum_default_91,call_function,einsum.default,forward,13,2,2,1,1457,4524,5 +1635,einsum_default_92,call_function,einsum.default,forward,13,2,2,1,1457,4524,5 +1651,mul_93,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8 +1654,mul_94,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8 +1639,einsum_default_93,call_function,einsum.default,forward,13,2,2,1,1457,4517,5 +1677,einsum_default_94,call_function,einsum.default,forward,13,2,2,1,1503,4504,5 +1678,add_66,call_function,add.Tensor,forward,13,2,2,1,1504,4503,10 +1688,mul_95,call_function,mul.Tensor,forward,13,2,2,1,1513,4490,8 +1690,mul_96,call_function,mul.Tensor,forward,13,2,2,1,1517,4489,8 +1696,einsum_default_95,call_function,einsum.default,forward,13,2,2,1,1524,4484,5 +1703,div_13,call_function,div.Tensor,forward,13,2,2,1,1531,4466,6 +1708,einsum_default_96,call_function,einsum.default,forward,13,2,2,1,1524,4465,5 
+1711,mul_97,call_function,mul.Tensor,forward,13,2,2,1,1540,4463,8 +1716,einsum_default_97,call_function,einsum.default,forward,13,2,2,1,1546,4461,5 +1717,add_69,call_function,add.Tensor,forward,13,2,2,1,1547,4460,10 +1727,mul_98,call_function,mul.Tensor,forward,14,2,2,1,1556,4447,8 +1729,mul_99,call_function,mul.Tensor,forward,14,2,2,1,1560,4446,8 +1735,einsum_default_98,call_function,einsum.default,forward,14,2,2,1,1567,4428,5 +1739,einsum_default_99,call_function,einsum.default,forward,14,2,2,1,1567,4428,5 +1755,mul_100,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8 +1758,mul_101,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8 +1743,einsum_default_100,call_function,einsum.default,forward,14,2,2,1,1567,4421,5 +1781,einsum_default_101,call_function,einsum.default,forward,14,2,2,1,1613,4408,5 +1782,add_71,call_function,add.Tensor,forward,14,2,2,1,1614,4407,10 +1792,mul_102,call_function,mul.Tensor,forward,14,2,2,1,1623,4394,8 +1794,mul_103,call_function,mul.Tensor,forward,14,2,2,1,1627,4393,8 +1800,einsum_default_102,call_function,einsum.default,forward,14,2,2,1,1634,4388,5 +1807,div_14,call_function,div.Tensor,forward,14,2,2,1,1641,4370,6 +1812,einsum_default_103,call_function,einsum.default,forward,14,2,2,1,1634,4369,5 +1815,mul_104,call_function,mul.Tensor,forward,14,2,2,1,1650,4367,8 +1820,einsum_default_104,call_function,einsum.default,forward,14,2,2,1,1656,4365,5 +1821,add_74,call_function,add.Tensor,forward,14,2,2,1,1657,4364,10 +1831,mul_105,call_function,mul.Tensor,forward,15,2,2,1,1666,4351,8 +1833,mul_106,call_function,mul.Tensor,forward,15,2,2,1,1670,4350,8 +1839,einsum_default_105,call_function,einsum.default,forward,15,2,2,1,1677,4332,5 +1843,einsum_default_106,call_function,einsum.default,forward,15,2,2,1,1677,4332,5 +1859,mul_107,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8 +1862,mul_108,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8 +1847,einsum_default_107,call_function,einsum.default,forward,15,2,2,1,1677,4325,5 
+1885,einsum_default_108,call_function,einsum.default,forward,15,2,2,1,1723,4312,5 +1886,add_76,call_function,add.Tensor,forward,15,2,2,1,1724,4311,10 +1896,mul_109,call_function,mul.Tensor,forward,15,2,2,1,1733,4298,8 +1898,mul_110,call_function,mul.Tensor,forward,15,2,2,1,1737,4297,8 +1904,einsum_default_109,call_function,einsum.default,forward,15,2,2,1,1744,4292,5 +1911,div_15,call_function,div.Tensor,forward,15,2,2,1,1751,4274,6 +1916,einsum_default_110,call_function,einsum.default,forward,15,2,2,1,1744,4273,5 +1919,mul_111,call_function,mul.Tensor,forward,15,2,2,1,1760,4271,8 +1924,einsum_default_111,call_function,einsum.default,forward,15,2,2,1,1766,4269,5 +1925,add_79,call_function,add.Tensor,forward,15,2,2,1,1767,4268,10 +1935,mul_112,call_function,mul.Tensor,forward,16,2,2,1,1776,4255,8 +1937,mul_113,call_function,mul.Tensor,forward,16,2,2,1,1780,4254,8 +1943,einsum_default_112,call_function,einsum.default,forward,16,2,2,1,1787,4236,5 +1947,einsum_default_113,call_function,einsum.default,forward,16,2,2,1,1787,4236,5 +1963,mul_114,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8 +1966,mul_115,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8 +1951,einsum_default_114,call_function,einsum.default,forward,16,2,2,1,1787,4229,5 +1989,einsum_default_115,call_function,einsum.default,forward,16,2,2,1,1833,4216,5 +1990,add_81,call_function,add.Tensor,forward,16,2,2,1,1834,4215,10 +2000,mul_116,call_function,mul.Tensor,forward,16,2,2,1,1843,4202,8 +2002,mul_117,call_function,mul.Tensor,forward,16,2,2,1,1847,4201,8 +2008,einsum_default_116,call_function,einsum.default,forward,16,2,2,1,1854,4196,5 +2015,div_16,call_function,div.Tensor,forward,16,2,2,1,1861,4178,6 +2020,einsum_default_117,call_function,einsum.default,forward,16,2,2,1,1854,4177,5 +2023,mul_118,call_function,mul.Tensor,forward,16,2,2,1,1870,4175,8 +2028,einsum_default_118,call_function,einsum.default,forward,16,2,2,1,1876,4173,5 +2029,add_84,call_function,add.Tensor,forward,16,2,2,1,1877,4172,10 
+2039,mul_119,call_function,mul.Tensor,forward,17,2,2,1,1886,4159,8 +2041,mul_120,call_function,mul.Tensor,forward,17,2,2,1,1890,4158,8 +2047,einsum_default_119,call_function,einsum.default,forward,17,2,2,1,1897,4140,5 +2051,einsum_default_120,call_function,einsum.default,forward,17,2,2,1,1897,4140,5 +2067,mul_121,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8 +2070,mul_122,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8 +2055,einsum_default_121,call_function,einsum.default,forward,17,2,2,1,1897,4133,5 +2093,einsum_default_122,call_function,einsum.default,forward,17,2,2,1,1943,4120,5 +2094,add_86,call_function,add.Tensor,forward,17,2,2,1,1944,4119,10 +2104,mul_123,call_function,mul.Tensor,forward,17,2,2,1,1953,4106,8 +2106,mul_124,call_function,mul.Tensor,forward,17,2,2,1,1957,4105,8 +2112,einsum_default_123,call_function,einsum.default,forward,17,2,2,1,1964,4100,5 +2119,div_17,call_function,div.Tensor,forward,17,2,2,1,1971,4082,6 +2124,einsum_default_124,call_function,einsum.default,forward,17,2,2,1,1964,4081,5 +2127,mul_125,call_function,mul.Tensor,forward,17,2,2,1,1980,4079,8 +2132,einsum_default_125,call_function,einsum.default,forward,17,2,2,1,1986,4077,5 +2133,add_89,call_function,add.Tensor,forward,17,2,2,1,1987,4076,10 +2143,mul_126,call_function,mul.Tensor,forward,18,2,2,1,1996,4063,8 +2145,mul_127,call_function,mul.Tensor,forward,18,2,2,1,2000,4062,8 +2151,einsum_default_126,call_function,einsum.default,forward,18,2,2,1,2007,4044,5 +2155,einsum_default_127,call_function,einsum.default,forward,18,2,2,1,2007,4044,5 +2171,mul_128,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8 +2174,mul_129,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8 +2159,einsum_default_128,call_function,einsum.default,forward,18,2,2,1,2007,4037,5 +2197,einsum_default_129,call_function,einsum.default,forward,18,2,2,1,2053,4024,5 +2198,add_91,call_function,add.Tensor,forward,18,2,2,1,2054,4023,10 +2208,mul_130,call_function,mul.Tensor,forward,18,2,2,1,2063,4010,8 
+2210,mul_131,call_function,mul.Tensor,forward,18,2,2,1,2067,4009,8 +2216,einsum_default_130,call_function,einsum.default,forward,18,2,2,1,2074,4004,5 +2223,div_18,call_function,div.Tensor,forward,18,2,2,1,2081,3986,6 +2228,einsum_default_131,call_function,einsum.default,forward,18,2,2,1,2074,3985,5 +2231,mul_132,call_function,mul.Tensor,forward,18,2,2,1,2090,3983,8 +2236,einsum_default_132,call_function,einsum.default,forward,18,2,2,1,2096,3981,5 +2237,add_94,call_function,add.Tensor,forward,18,2,2,1,2097,3980,10 +2247,mul_133,call_function,mul.Tensor,forward,19,2,2,1,2106,3967,8 +2249,mul_134,call_function,mul.Tensor,forward,19,2,2,1,2110,3966,8 +2255,einsum_default_133,call_function,einsum.default,forward,19,2,2,1,2117,3948,5 +2259,einsum_default_134,call_function,einsum.default,forward,19,2,2,1,2117,3948,5 +2275,mul_135,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8 +2278,mul_136,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8 +2263,einsum_default_135,call_function,einsum.default,forward,19,2,2,1,2117,3941,5 +2301,einsum_default_136,call_function,einsum.default,forward,19,2,2,1,2163,3928,5 +2302,add_96,call_function,add.Tensor,forward,19,2,2,1,2164,3927,10 +2312,mul_137,call_function,mul.Tensor,forward,19,2,2,1,2173,3914,8 +2314,mul_138,call_function,mul.Tensor,forward,19,2,2,1,2177,3913,8 +2320,einsum_default_137,call_function,einsum.default,forward,19,2,2,1,2184,3908,5 +2327,div_19,call_function,div.Tensor,forward,19,2,2,1,2191,3890,6 +2332,einsum_default_138,call_function,einsum.default,forward,19,2,2,1,2184,3889,5 +2335,mul_139,call_function,mul.Tensor,forward,19,2,2,1,2200,3887,8 +2340,einsum_default_139,call_function,einsum.default,forward,19,2,2,1,2206,3885,5 +2341,add_99,call_function,add.Tensor,forward,19,2,2,1,2207,3884,10 +2351,mul_140,call_function,mul.Tensor,forward,20,2,2,1,2216,3871,8 +2353,mul_141,call_function,mul.Tensor,forward,20,2,2,1,2220,3870,8 +2359,einsum_default_140,call_function,einsum.default,forward,20,2,2,1,2227,3852,5 
+2363,einsum_default_141,call_function,einsum.default,forward,20,2,2,1,2227,3852,5 +2379,mul_142,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8 +2382,mul_143,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8 +2367,einsum_default_142,call_function,einsum.default,forward,20,2,2,1,2227,3845,5 +2405,einsum_default_143,call_function,einsum.default,forward,20,2,2,1,2273,3832,5 +2406,add_101,call_function,add.Tensor,forward,20,2,2,1,2274,3831,10 +2416,mul_144,call_function,mul.Tensor,forward,20,2,2,1,2283,3818,8 +2418,mul_145,call_function,mul.Tensor,forward,20,2,2,1,2287,3817,8 +2424,einsum_default_144,call_function,einsum.default,forward,20,2,2,1,2294,3812,5 +2431,div_20,call_function,div.Tensor,forward,20,2,2,1,2301,3794,6 +2436,einsum_default_145,call_function,einsum.default,forward,20,2,2,1,2294,3793,5 +2439,mul_146,call_function,mul.Tensor,forward,20,2,2,1,2310,3791,8 +2444,einsum_default_146,call_function,einsum.default,forward,20,2,2,1,2316,3789,5 +2445,add_104,call_function,add.Tensor,forward,20,2,2,1,2317,3788,10 +2455,mul_147,call_function,mul.Tensor,forward,21,2,2,1,2326,3775,8 +2457,mul_148,call_function,mul.Tensor,forward,21,2,2,1,2330,3774,8 +2463,einsum_default_147,call_function,einsum.default,forward,21,2,2,1,2337,3756,5 +2467,einsum_default_148,call_function,einsum.default,forward,21,2,2,1,2337,3756,5 +2483,mul_149,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8 +2486,mul_150,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8 +2471,einsum_default_149,call_function,einsum.default,forward,21,2,2,1,2337,3749,5 +2509,einsum_default_150,call_function,einsum.default,forward,21,2,2,1,2383,3736,5 +2510,add_106,call_function,add.Tensor,forward,21,2,2,1,2384,3735,10 +2520,mul_151,call_function,mul.Tensor,forward,21,2,2,1,2393,3722,8 +2522,mul_152,call_function,mul.Tensor,forward,21,2,2,1,2397,3721,8 +2528,einsum_default_151,call_function,einsum.default,forward,21,2,2,1,2404,3716,5 
+2535,div_21,call_function,div.Tensor,forward,21,2,2,1,2411,3698,6 +2540,einsum_default_152,call_function,einsum.default,forward,21,2,2,1,2404,3697,5 +2543,mul_153,call_function,mul.Tensor,forward,21,2,2,1,2420,3695,8 +2548,einsum_default_153,call_function,einsum.default,forward,21,2,2,1,2426,3693,5 +2549,add_109,call_function,add.Tensor,forward,21,2,2,1,2427,3692,10 +2559,mul_154,call_function,mul.Tensor,forward,22,2,2,1,2436,3679,8 +2561,mul_155,call_function,mul.Tensor,forward,22,2,2,1,2440,3678,8 +2567,einsum_default_154,call_function,einsum.default,forward,22,2,2,1,2447,3660,5 +2571,einsum_default_155,call_function,einsum.default,forward,22,2,2,1,2447,3660,5 +2587,mul_156,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8 +2590,mul_157,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8 +2575,einsum_default_156,call_function,einsum.default,forward,22,2,2,1,2447,3653,5 +2613,einsum_default_157,call_function,einsum.default,forward,22,2,2,1,2493,3640,5 +2614,add_111,call_function,add.Tensor,forward,22,2,2,1,2494,3639,10 +2624,mul_158,call_function,mul.Tensor,forward,22,2,2,1,2503,3626,8 +2626,mul_159,call_function,mul.Tensor,forward,22,2,2,1,2507,3625,8 +2632,einsum_default_158,call_function,einsum.default,forward,22,2,2,1,2514,3620,5 +2639,div_22,call_function,div.Tensor,forward,22,2,2,1,2521,3602,6 +2644,einsum_default_159,call_function,einsum.default,forward,22,2,2,1,2514,3601,5 +2647,mul_160,call_function,mul.Tensor,forward,22,2,2,1,2530,3599,8 +2652,einsum_default_160,call_function,einsum.default,forward,22,2,2,1,2536,3597,5 +2653,add_114,call_function,add.Tensor,forward,22,2,2,1,2537,3596,10 +2663,mul_161,call_function,mul.Tensor,forward,23,2,2,1,2546,3583,8 +2665,mul_162,call_function,mul.Tensor,forward,23,2,2,1,2550,3582,8 +2671,einsum_default_161,call_function,einsum.default,forward,23,2,2,1,2557,3564,5 +2675,einsum_default_162,call_function,einsum.default,forward,23,2,2,1,2557,3564,5 
+2691,mul_163,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8 +2694,mul_164,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8 +2679,einsum_default_163,call_function,einsum.default,forward,23,2,2,1,2557,3557,5 +2717,einsum_default_164,call_function,einsum.default,forward,23,2,2,1,2603,3544,5 +2718,add_116,call_function,add.Tensor,forward,23,2,2,1,2604,3543,10 +2728,mul_165,call_function,mul.Tensor,forward,23,2,2,1,2613,3530,8 +2730,mul_166,call_function,mul.Tensor,forward,23,2,2,1,2617,3529,8 +2736,einsum_default_165,call_function,einsum.default,forward,23,2,2,1,2624,3524,5 +2743,div_23,call_function,div.Tensor,forward,23,2,2,1,2631,3506,6 +2748,einsum_default_166,call_function,einsum.default,forward,23,2,2,1,2624,3505,5 +2751,mul_167,call_function,mul.Tensor,forward,23,2,2,1,2640,3503,8 +2756,einsum_default_167,call_function,einsum.default,forward,23,2,2,1,2646,3501,5 +2757,add_119,call_function,add.Tensor,forward,23,2,2,1,2647,3500,10 +2767,mul_168,call_function,mul.Tensor,forward,24,2,2,1,2656,3487,8 +2769,mul_169,call_function,mul.Tensor,forward,24,2,2,1,2660,3486,8 +2775,einsum_default_168,call_function,einsum.default,forward,24,2,2,1,2667,3468,5 +2779,einsum_default_169,call_function,einsum.default,forward,24,2,2,1,2667,3468,5 +2795,mul_170,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8 +2798,mul_171,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8 +2783,einsum_default_170,call_function,einsum.default,forward,24,2,2,1,2667,3461,5 +2821,einsum_default_171,call_function,einsum.default,forward,24,2,2,1,2713,3448,5 +2822,add_121,call_function,add.Tensor,forward,24,2,2,1,2714,3447,10 +2832,mul_172,call_function,mul.Tensor,forward,24,2,2,1,2723,3434,8 +2834,mul_173,call_function,mul.Tensor,forward,24,2,2,1,2727,3433,8 +2840,einsum_default_172,call_function,einsum.default,forward,24,2,2,1,2734,3428,5 +2847,div_24,call_function,div.Tensor,forward,24,2,2,1,2741,3410,6 
+2852,einsum_default_173,call_function,einsum.default,forward,24,2,2,1,2734,3409,5 +2855,mul_174,call_function,mul.Tensor,forward,24,2,2,1,2750,3407,8 +2860,einsum_default_174,call_function,einsum.default,forward,24,2,2,1,2756,3405,5 +2861,add_124,call_function,add.Tensor,forward,24,2,2,1,2757,3404,10 +2871,mul_175,call_function,mul.Tensor,forward,25,2,2,1,2766,3391,8 +2873,mul_176,call_function,mul.Tensor,forward,25,2,2,1,2770,3390,8 +2879,einsum_default_175,call_function,einsum.default,forward,25,2,2,1,2777,3372,5 +2883,einsum_default_176,call_function,einsum.default,forward,25,2,2,1,2777,3372,5 +2899,mul_177,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8 +2902,mul_178,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8 +2887,einsum_default_177,call_function,einsum.default,forward,25,2,2,1,2777,3365,5 +2925,einsum_default_178,call_function,einsum.default,forward,25,2,2,1,2823,3352,5 +2926,add_126,call_function,add.Tensor,forward,25,2,2,1,2824,3351,10 +2936,mul_179,call_function,mul.Tensor,forward,25,2,2,1,2833,3338,8 +2938,mul_180,call_function,mul.Tensor,forward,25,2,2,1,2837,3337,8 +2944,einsum_default_179,call_function,einsum.default,forward,25,2,2,1,2844,3332,5 +2951,div_25,call_function,div.Tensor,forward,25,2,2,1,2851,3314,6 +2956,einsum_default_180,call_function,einsum.default,forward,25,2,2,1,2844,3313,5 +2959,mul_181,call_function,mul.Tensor,forward,25,2,2,1,2860,3311,8 +2964,einsum_default_181,call_function,einsum.default,forward,25,2,2,1,2866,3309,5 +2965,add_129,call_function,add.Tensor,forward,25,2,2,1,2867,3308,10 +2975,mul_182,call_function,mul.Tensor,forward,26,2,2,1,2876,3295,8 +2977,mul_183,call_function,mul.Tensor,forward,26,2,2,1,2880,3294,8 +2983,einsum_default_182,call_function,einsum.default,forward,26,2,2,1,2887,3276,5 +2987,einsum_default_183,call_function,einsum.default,forward,26,2,2,1,2887,3276,5 +3003,mul_184,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8 
+3006,mul_185,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8 +2991,einsum_default_184,call_function,einsum.default,forward,26,2,2,1,2887,3269,5 +3029,einsum_default_185,call_function,einsum.default,forward,26,2,2,1,2933,3256,5 +3030,add_131,call_function,add.Tensor,forward,26,2,2,1,2934,3255,10 +3040,mul_186,call_function,mul.Tensor,forward,26,2,2,1,2943,3242,8 +3042,mul_187,call_function,mul.Tensor,forward,26,2,2,1,2947,3241,8 +3048,einsum_default_186,call_function,einsum.default,forward,26,2,2,1,2954,3236,5 +3055,div_26,call_function,div.Tensor,forward,26,2,2,1,2961,3218,6 +3060,einsum_default_187,call_function,einsum.default,forward,26,2,2,1,2954,3217,5 +3063,mul_188,call_function,mul.Tensor,forward,26,2,2,1,2970,3215,8 +3068,einsum_default_188,call_function,einsum.default,forward,26,2,2,1,2976,3213,5 +3069,add_134,call_function,add.Tensor,forward,26,2,2,1,2977,3212,10 +3079,mul_189,call_function,mul.Tensor,forward,27,2,2,1,2986,3199,8 +3081,mul_190,call_function,mul.Tensor,forward,27,2,2,1,2990,3198,8 +3087,einsum_default_189,call_function,einsum.default,forward,27,2,2,1,2997,3180,5 +3091,einsum_default_190,call_function,einsum.default,forward,27,2,2,1,2997,3180,5 +3107,mul_191,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8 +3110,mul_192,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8 +3095,einsum_default_191,call_function,einsum.default,forward,27,2,2,1,2997,3173,5 +3133,einsum_default_192,call_function,einsum.default,forward,27,2,2,1,3043,3160,5 +3134,add_136,call_function,add.Tensor,forward,27,2,2,1,3044,3159,10 +3144,mul_193,call_function,mul.Tensor,forward,27,2,2,1,3053,3146,8 +3146,mul_194,call_function,mul.Tensor,forward,27,2,2,1,3057,3145,8 +3152,einsum_default_193,call_function,einsum.default,forward,27,2,2,1,3064,3140,5 +3159,div_27,call_function,div.Tensor,forward,27,2,2,1,3071,3122,6 +3164,einsum_default_194,call_function,einsum.default,forward,27,2,2,1,3064,3121,5 
+3167,mul_195,call_function,mul.Tensor,forward,27,2,2,1,3080,3119,8 +3172,einsum_default_195,call_function,einsum.default,forward,27,2,2,1,3086,3117,5 +3173,add_139,call_function,add.Tensor,forward,27,2,2,1,3087,3116,10 +3196,einsum_default_198,call_function,einsum.default,backward,,2,2,1,8,3099,5 +3204,mul_199,call_function,mul.Tensor,backward,,2,2,1,3097,3097,8 +3203,mul_198,call_function,mul.Tensor,backward,,2,2,1,15,3091,8 +3207,mul_200,call_function,mul.Tensor,backward,,2,2,1,3114,3089,8 +3210,mul_201,call_function,mul.Tensor,backward,,2,2,1,3117,3087,8 +3211,sub,call_function,sub.Tensor,backward,,2,2,1,3118,3086,10 +3212,mul_202,call_function,mul.Tensor,backward,,2,2,1,3119,3085,8 +3222,einsum_default_200,call_function,einsum.default,backward,27,2,2,1,3123,3078,5 +3228,mul_205,call_function,mul.Tensor,backward,27,2,2,1,3125,3069,8 +3247,mul_208,call_function,mul.Tensor,backward,27,2,2,1,3075,3068,8 +3245,mul_207,call_function,mul.Tensor,backward,27,2,2,1,3135,3067,8 +3249,mul_209,call_function,mul.Tensor,backward,27,2,2,1,3139,3066,8 +3227,mul_204,call_function,mul.Tensor,backward,27,2,2,1,3125,3065,8 +3232,einsum_default_202,call_function,einsum.default,backward,27,2,2,1,3128,3059,5 +3254,einsum_default_204,call_function,einsum.default,backward,27,2,2,1,3143,3059,5 +3255,add_143,call_function,add.Tensor,unknown,,2,2,1,3148,3058,10 +3264,mul_211,call_function,mul.Tensor,backward,27,2,2,1,3054,3056,8 +3263,mul_210,call_function,mul.Tensor,backward,27,2,2,1,3152,3050,8 +3267,mul_212,call_function,mul.Tensor,backward,27,2,2,1,3157,3048,8 +3270,mul_213,call_function,mul.Tensor,backward,27,2,2,1,3160,3046,8 +3271,sub_2,call_function,sub.Tensor,backward,27,2,2,1,3161,3045,10 +3272,mul_214,call_function,mul.Tensor,backward,27,2,2,1,3162,3044,8 +3277,add_144,call_function,add.Tensor,unknown,,2,2,1,3164,3042,10 +3283,einsum_default_206,call_function,einsum.default,backward,27,2,2,1,3167,3036,5 +3302,mul_216,call_function,mul.Tensor,backward,27,2,2,1,3181,3002,8 
+3307,mul_217,call_function,mul.Tensor,backward,27,2,2,1,3181,3001,8 +3320,einsum_default_208,call_function,einsum.default,backward,27,2,2,1,3179,2992,5 +3327,einsum_default_210,call_function,einsum.default,backward,27,2,2,1,3188,2992,5 +3328,add_145,call_function,add.Tensor,unknown,,2,2,1,3195,2991,10 +3335,einsum_default_212,call_function,einsum.default,backward,27,2,2,1,3188,2991,5 +3336,add_146,call_function,add.Tensor,unknown,,2,2,1,3211,2990,10 +3345,mul_219,call_function,mul.Tensor,backward,27,2,2,1,2987,2988,8 +3344,mul_218,call_function,mul.Tensor,backward,27,2,2,1,3215,2982,8 +3348,mul_220,call_function,mul.Tensor,backward,27,2,2,1,3220,2980,8 +3351,mul_221,call_function,mul.Tensor,backward,27,2,2,1,3223,2978,8 +3352,sub_3,call_function,sub.Tensor,backward,27,2,2,1,3224,2977,10 +3353,mul_222,call_function,mul.Tensor,backward,27,2,2,1,3225,2976,8 +3358,add_147,call_function,add.Tensor,unknown,,2,2,1,3227,2974,10 +3364,einsum_default_214,call_function,einsum.default,backward,26,2,2,1,3230,2968,5 +3370,mul_225,call_function,mul.Tensor,backward,26,2,2,1,3232,2959,8 +3389,mul_228,call_function,mul.Tensor,backward,26,2,2,1,2965,2958,8 +3387,mul_227,call_function,mul.Tensor,backward,26,2,2,1,3242,2957,8 +3391,mul_229,call_function,mul.Tensor,backward,26,2,2,1,3246,2956,8 +3369,mul_224,call_function,mul.Tensor,backward,26,2,2,1,3232,2955,8 +3374,einsum_default_216,call_function,einsum.default,backward,26,2,2,1,3235,2949,5 +3396,einsum_default_218,call_function,einsum.default,backward,26,2,2,1,3250,2949,5 +3397,add_150,call_function,add.Tensor,unknown,,2,2,1,3255,2948,10 +3406,mul_231,call_function,mul.Tensor,backward,26,2,2,1,2944,2946,8 +3405,mul_230,call_function,mul.Tensor,backward,26,2,2,1,3259,2940,8 +3409,mul_232,call_function,mul.Tensor,backward,26,2,2,1,3264,2938,8 +3412,mul_233,call_function,mul.Tensor,backward,26,2,2,1,3267,2936,8 +3413,sub_5,call_function,sub.Tensor,backward,26,2,2,1,3268,2935,10 
+3414,mul_234,call_function,mul.Tensor,backward,26,2,2,1,3269,2934,8 +3419,add_151,call_function,add.Tensor,unknown,,2,2,1,3271,2932,10 +3425,einsum_default_220,call_function,einsum.default,backward,26,2,2,1,3274,2926,5 +3444,mul_236,call_function,mul.Tensor,backward,26,2,2,1,3288,2892,8 +3449,mul_237,call_function,mul.Tensor,backward,26,2,2,1,3288,2891,8 +3462,einsum_default_222,call_function,einsum.default,backward,26,2,2,1,3286,2882,5 +3469,einsum_default_224,call_function,einsum.default,backward,26,2,2,1,3295,2882,5 +3470,add_152,call_function,add.Tensor,unknown,,2,2,1,3302,2881,10 +3477,einsum_default_226,call_function,einsum.default,backward,26,2,2,1,3295,2881,5 +3478,add_153,call_function,add.Tensor,unknown,,2,2,1,3318,2880,10 +3487,mul_239,call_function,mul.Tensor,backward,26,2,2,1,2877,2878,8 +3486,mul_238,call_function,mul.Tensor,backward,26,2,2,1,3322,2872,8 +3490,mul_240,call_function,mul.Tensor,backward,26,2,2,1,3327,2870,8 +3493,mul_241,call_function,mul.Tensor,backward,26,2,2,1,3330,2868,8 +3494,sub_6,call_function,sub.Tensor,backward,26,2,2,1,3331,2867,10 +3495,mul_242,call_function,mul.Tensor,backward,26,2,2,1,3332,2866,8 +3500,add_154,call_function,add.Tensor,unknown,,2,2,1,3334,2864,10 +3506,einsum_default_228,call_function,einsum.default,backward,25,2,2,1,3337,2858,5 +3512,mul_245,call_function,mul.Tensor,backward,25,2,2,1,3339,2849,8 +3531,mul_248,call_function,mul.Tensor,backward,25,2,2,1,2855,2848,8 +3529,mul_247,call_function,mul.Tensor,backward,25,2,2,1,3349,2847,8 +3533,mul_249,call_function,mul.Tensor,backward,25,2,2,1,3353,2846,8 +3511,mul_244,call_function,mul.Tensor,backward,25,2,2,1,3339,2845,8 +3516,einsum_default_230,call_function,einsum.default,backward,25,2,2,1,3342,2839,5 +3538,einsum_default_232,call_function,einsum.default,backward,25,2,2,1,3357,2839,5 +3539,add_157,call_function,add.Tensor,unknown,,2,2,1,3362,2838,10 +3548,mul_251,call_function,mul.Tensor,backward,25,2,2,1,2834,2836,8 
+3547,mul_250,call_function,mul.Tensor,backward,25,2,2,1,3366,2830,8 +3551,mul_252,call_function,mul.Tensor,backward,25,2,2,1,3371,2828,8 +3554,mul_253,call_function,mul.Tensor,backward,25,2,2,1,3374,2826,8 +3555,sub_8,call_function,sub.Tensor,backward,25,2,2,1,3375,2825,10 +3556,mul_254,call_function,mul.Tensor,backward,25,2,2,1,3376,2824,8 +3561,add_158,call_function,add.Tensor,unknown,,2,2,1,3378,2822,10 +3567,einsum_default_234,call_function,einsum.default,backward,25,2,2,1,3381,2816,5 +3586,mul_256,call_function,mul.Tensor,backward,25,2,2,1,3395,2782,8 +3591,mul_257,call_function,mul.Tensor,backward,25,2,2,1,3395,2781,8 +3604,einsum_default_236,call_function,einsum.default,backward,25,2,2,1,3393,2772,5 +3611,einsum_default_238,call_function,einsum.default,backward,25,2,2,1,3402,2772,5 +3612,add_159,call_function,add.Tensor,unknown,,2,2,1,3409,2771,10 +3619,einsum_default_240,call_function,einsum.default,backward,25,2,2,1,3402,2771,5 +3620,add_160,call_function,add.Tensor,unknown,,2,2,1,3425,2770,10 +3629,mul_259,call_function,mul.Tensor,backward,25,2,2,1,2767,2768,8 +3628,mul_258,call_function,mul.Tensor,backward,25,2,2,1,3429,2762,8 +3632,mul_260,call_function,mul.Tensor,backward,25,2,2,1,3434,2760,8 +3635,mul_261,call_function,mul.Tensor,backward,25,2,2,1,3437,2758,8 +3636,sub_9,call_function,sub.Tensor,backward,25,2,2,1,3438,2757,10 +3637,mul_262,call_function,mul.Tensor,backward,25,2,2,1,3439,2756,8 +3642,add_161,call_function,add.Tensor,unknown,,2,2,1,3441,2754,10 +3648,einsum_default_242,call_function,einsum.default,backward,24,2,2,1,3444,2748,5 +3654,mul_265,call_function,mul.Tensor,backward,24,2,2,1,3446,2739,8 +3673,mul_268,call_function,mul.Tensor,backward,24,2,2,1,2745,2738,8 +3671,mul_267,call_function,mul.Tensor,backward,24,2,2,1,3456,2737,8 +3675,mul_269,call_function,mul.Tensor,backward,24,2,2,1,3460,2736,8 +3653,mul_264,call_function,mul.Tensor,backward,24,2,2,1,3446,2735,8 
+3658,einsum_default_244,call_function,einsum.default,backward,24,2,2,1,3449,2729,5 +3680,einsum_default_246,call_function,einsum.default,backward,24,2,2,1,3464,2729,5 +3681,add_164,call_function,add.Tensor,unknown,,2,2,1,3469,2728,10 +3690,mul_271,call_function,mul.Tensor,backward,24,2,2,1,2724,2726,8 +3689,mul_270,call_function,mul.Tensor,backward,24,2,2,1,3473,2720,8 +3693,mul_272,call_function,mul.Tensor,backward,24,2,2,1,3478,2718,8 +3696,mul_273,call_function,mul.Tensor,backward,24,2,2,1,3481,2716,8 +3697,sub_11,call_function,sub.Tensor,backward,24,2,2,1,3482,2715,10 +3698,mul_274,call_function,mul.Tensor,backward,24,2,2,1,3483,2714,8 +3703,add_165,call_function,add.Tensor,unknown,,2,2,1,3485,2712,10 +3709,einsum_default_248,call_function,einsum.default,backward,24,2,2,1,3488,2706,5 +3728,mul_276,call_function,mul.Tensor,backward,24,2,2,1,3502,2672,8 +3733,mul_277,call_function,mul.Tensor,backward,24,2,2,1,3502,2671,8 +3746,einsum_default_250,call_function,einsum.default,backward,24,2,2,1,3500,2662,5 +3753,einsum_default_252,call_function,einsum.default,backward,24,2,2,1,3509,2662,5 +3754,add_166,call_function,add.Tensor,unknown,,2,2,1,3516,2661,10 +3761,einsum_default_254,call_function,einsum.default,backward,24,2,2,1,3509,2661,5 +3762,add_167,call_function,add.Tensor,unknown,,2,2,1,3532,2660,10 +3771,mul_279,call_function,mul.Tensor,backward,24,2,2,1,2657,2658,8 +3770,mul_278,call_function,mul.Tensor,backward,24,2,2,1,3536,2652,8 +3774,mul_280,call_function,mul.Tensor,backward,24,2,2,1,3541,2650,8 +3777,mul_281,call_function,mul.Tensor,backward,24,2,2,1,3544,2648,8 +3778,sub_12,call_function,sub.Tensor,backward,24,2,2,1,3545,2647,10 +3779,mul_282,call_function,mul.Tensor,backward,24,2,2,1,3546,2646,8 +3784,add_168,call_function,add.Tensor,unknown,,2,2,1,3548,2644,10 +3790,einsum_default_256,call_function,einsum.default,backward,23,2,2,1,3551,2638,5 +3796,mul_285,call_function,mul.Tensor,backward,23,2,2,1,3553,2629,8 
+3815,mul_288,call_function,mul.Tensor,backward,23,2,2,1,2635,2628,8 +3813,mul_287,call_function,mul.Tensor,backward,23,2,2,1,3563,2627,8 +3817,mul_289,call_function,mul.Tensor,backward,23,2,2,1,3567,2626,8 +3795,mul_284,call_function,mul.Tensor,backward,23,2,2,1,3553,2625,8 +3800,einsum_default_258,call_function,einsum.default,backward,23,2,2,1,3556,2619,5 +3822,einsum_default_260,call_function,einsum.default,backward,23,2,2,1,3571,2619,5 +3823,add_171,call_function,add.Tensor,unknown,,2,2,1,3576,2618,10 +3832,mul_291,call_function,mul.Tensor,backward,23,2,2,1,2614,2616,8 +3831,mul_290,call_function,mul.Tensor,backward,23,2,2,1,3580,2610,8 +3835,mul_292,call_function,mul.Tensor,backward,23,2,2,1,3585,2608,8 +3838,mul_293,call_function,mul.Tensor,backward,23,2,2,1,3588,2606,8 +3839,sub_14,call_function,sub.Tensor,backward,23,2,2,1,3589,2605,10 +3840,mul_294,call_function,mul.Tensor,backward,23,2,2,1,3590,2604,8 +3845,add_172,call_function,add.Tensor,unknown,,2,2,1,3592,2602,10 +3851,einsum_default_262,call_function,einsum.default,backward,23,2,2,1,3595,2596,5 +3870,mul_296,call_function,mul.Tensor,backward,23,2,2,1,3609,2562,8 +3875,mul_297,call_function,mul.Tensor,backward,23,2,2,1,3609,2561,8 +3888,einsum_default_264,call_function,einsum.default,backward,23,2,2,1,3607,2552,5 +3895,einsum_default_266,call_function,einsum.default,backward,23,2,2,1,3616,2552,5 +3896,add_173,call_function,add.Tensor,unknown,,2,2,1,3623,2551,10 +3903,einsum_default_268,call_function,einsum.default,backward,23,2,2,1,3616,2551,5 +3904,add_174,call_function,add.Tensor,unknown,,2,2,1,3639,2550,10 +3913,mul_299,call_function,mul.Tensor,backward,23,2,2,1,2547,2548,8 +3912,mul_298,call_function,mul.Tensor,backward,23,2,2,1,3643,2542,8 +3916,mul_300,call_function,mul.Tensor,backward,23,2,2,1,3648,2540,8 +3919,mul_301,call_function,mul.Tensor,backward,23,2,2,1,3651,2538,8 +3920,sub_15,call_function,sub.Tensor,backward,23,2,2,1,3652,2537,10 
+3921,mul_302,call_function,mul.Tensor,backward,23,2,2,1,3653,2536,8 +3926,add_175,call_function,add.Tensor,unknown,,2,2,1,3655,2534,10 +3932,einsum_default_270,call_function,einsum.default,backward,22,2,2,1,3658,2528,5 +3938,mul_305,call_function,mul.Tensor,backward,22,2,2,1,3660,2519,8 +3957,mul_308,call_function,mul.Tensor,backward,22,2,2,1,2525,2518,8 +3955,mul_307,call_function,mul.Tensor,backward,22,2,2,1,3670,2517,8 +3959,mul_309,call_function,mul.Tensor,backward,22,2,2,1,3674,2516,8 +3937,mul_304,call_function,mul.Tensor,backward,22,2,2,1,3660,2515,8 +3942,einsum_default_272,call_function,einsum.default,backward,22,2,2,1,3663,2509,5 +3964,einsum_default_274,call_function,einsum.default,backward,22,2,2,1,3678,2509,5 +3965,add_178,call_function,add.Tensor,unknown,,2,2,1,3683,2508,10 +3974,mul_311,call_function,mul.Tensor,backward,22,2,2,1,2504,2506,8 +3973,mul_310,call_function,mul.Tensor,backward,22,2,2,1,3687,2500,8 +3977,mul_312,call_function,mul.Tensor,backward,22,2,2,1,3692,2498,8 +3980,mul_313,call_function,mul.Tensor,backward,22,2,2,1,3695,2496,8 +3981,sub_17,call_function,sub.Tensor,backward,22,2,2,1,3696,2495,10 +3982,mul_314,call_function,mul.Tensor,backward,22,2,2,1,3697,2494,8 +3987,add_179,call_function,add.Tensor,unknown,,2,2,1,3699,2492,10 +3993,einsum_default_276,call_function,einsum.default,backward,22,2,2,1,3702,2486,5 +4012,mul_316,call_function,mul.Tensor,backward,22,2,2,1,3716,2452,8 +4017,mul_317,call_function,mul.Tensor,backward,22,2,2,1,3716,2451,8 +4030,einsum_default_278,call_function,einsum.default,backward,22,2,2,1,3714,2442,5 +4037,einsum_default_280,call_function,einsum.default,backward,22,2,2,1,3723,2442,5 +4038,add_180,call_function,add.Tensor,unknown,,2,2,1,3730,2441,10 +4045,einsum_default_282,call_function,einsum.default,backward,22,2,2,1,3723,2441,5 +4046,add_181,call_function,add.Tensor,unknown,,2,2,1,3746,2440,10 +4055,mul_319,call_function,mul.Tensor,backward,22,2,2,1,2437,2438,8 
+4054,mul_318,call_function,mul.Tensor,backward,22,2,2,1,3750,2432,8 +4058,mul_320,call_function,mul.Tensor,backward,22,2,2,1,3755,2430,8 +4061,mul_321,call_function,mul.Tensor,backward,22,2,2,1,3758,2428,8 +4062,sub_18,call_function,sub.Tensor,backward,22,2,2,1,3759,2427,10 +4063,mul_322,call_function,mul.Tensor,backward,22,2,2,1,3760,2426,8 +4068,add_182,call_function,add.Tensor,unknown,,2,2,1,3762,2424,10 +4074,einsum_default_284,call_function,einsum.default,backward,21,2,2,1,3765,2418,5 +4080,mul_325,call_function,mul.Tensor,backward,21,2,2,1,3767,2409,8 +4099,mul_328,call_function,mul.Tensor,backward,21,2,2,1,2415,2408,8 +4097,mul_327,call_function,mul.Tensor,backward,21,2,2,1,3777,2407,8 +4101,mul_329,call_function,mul.Tensor,backward,21,2,2,1,3781,2406,8 +4079,mul_324,call_function,mul.Tensor,backward,21,2,2,1,3767,2405,8 +4084,einsum_default_286,call_function,einsum.default,backward,21,2,2,1,3770,2399,5 +4106,einsum_default_288,call_function,einsum.default,backward,21,2,2,1,3785,2399,5 +4107,add_185,call_function,add.Tensor,unknown,,2,2,1,3790,2398,10 +4116,mul_331,call_function,mul.Tensor,backward,21,2,2,1,2394,2396,8 +4115,mul_330,call_function,mul.Tensor,backward,21,2,2,1,3794,2390,8 +4119,mul_332,call_function,mul.Tensor,backward,21,2,2,1,3799,2388,8 +4122,mul_333,call_function,mul.Tensor,backward,21,2,2,1,3802,2386,8 +4123,sub_20,call_function,sub.Tensor,backward,21,2,2,1,3803,2385,10 +4124,mul_334,call_function,mul.Tensor,backward,21,2,2,1,3804,2384,8 +4129,add_186,call_function,add.Tensor,unknown,,2,2,1,3806,2382,10 +4135,einsum_default_290,call_function,einsum.default,backward,21,2,2,1,3809,2376,5 +4154,mul_336,call_function,mul.Tensor,backward,21,2,2,1,3823,2342,8 +4159,mul_337,call_function,mul.Tensor,backward,21,2,2,1,3823,2341,8 +4172,einsum_default_292,call_function,einsum.default,backward,21,2,2,1,3821,2332,5 +4179,einsum_default_294,call_function,einsum.default,backward,21,2,2,1,3830,2332,5 
+4180,add_187,call_function,add.Tensor,unknown,,2,2,1,3837,2331,10 +4187,einsum_default_296,call_function,einsum.default,backward,21,2,2,1,3830,2331,5 +4188,add_188,call_function,add.Tensor,unknown,,2,2,1,3853,2330,10 +4197,mul_339,call_function,mul.Tensor,backward,21,2,2,1,2327,2328,8 +4196,mul_338,call_function,mul.Tensor,backward,21,2,2,1,3857,2322,8 +4200,mul_340,call_function,mul.Tensor,backward,21,2,2,1,3862,2320,8 +4203,mul_341,call_function,mul.Tensor,backward,21,2,2,1,3865,2318,8 +4204,sub_21,call_function,sub.Tensor,backward,21,2,2,1,3866,2317,10 +4205,mul_342,call_function,mul.Tensor,backward,21,2,2,1,3867,2316,8 +4210,add_189,call_function,add.Tensor,unknown,,2,2,1,3869,2314,10 +4216,einsum_default_298,call_function,einsum.default,backward,20,2,2,1,3872,2308,5 +4222,mul_345,call_function,mul.Tensor,backward,20,2,2,1,3874,2299,8 +4241,mul_348,call_function,mul.Tensor,backward,20,2,2,1,2305,2298,8 +4239,mul_347,call_function,mul.Tensor,backward,20,2,2,1,3884,2297,8 +4243,mul_349,call_function,mul.Tensor,backward,20,2,2,1,3888,2296,8 +4221,mul_344,call_function,mul.Tensor,backward,20,2,2,1,3874,2295,8 +4226,einsum_default_300,call_function,einsum.default,backward,20,2,2,1,3877,2289,5 +4248,einsum_default_302,call_function,einsum.default,backward,20,2,2,1,3892,2289,5 +4249,add_192,call_function,add.Tensor,unknown,,2,2,1,3897,2288,10 +4258,mul_351,call_function,mul.Tensor,backward,20,2,2,1,2284,2286,8 +4257,mul_350,call_function,mul.Tensor,backward,20,2,2,1,3901,2280,8 +4261,mul_352,call_function,mul.Tensor,backward,20,2,2,1,3906,2278,8 +4264,mul_353,call_function,mul.Tensor,backward,20,2,2,1,3909,2276,8 +4265,sub_23,call_function,sub.Tensor,backward,20,2,2,1,3910,2275,10 +4266,mul_354,call_function,mul.Tensor,backward,20,2,2,1,3911,2274,8 +4271,add_193,call_function,add.Tensor,unknown,,2,2,1,3913,2272,10 +4277,einsum_default_304,call_function,einsum.default,backward,20,2,2,1,3916,2266,5 +4296,mul_356,call_function,mul.Tensor,backward,20,2,2,1,3930,2232,8 
+4301,mul_357,call_function,mul.Tensor,backward,20,2,2,1,3930,2231,8 +4314,einsum_default_306,call_function,einsum.default,backward,20,2,2,1,3928,2222,5 +4321,einsum_default_308,call_function,einsum.default,backward,20,2,2,1,3937,2222,5 +4322,add_194,call_function,add.Tensor,unknown,,2,2,1,3944,2221,10 +4329,einsum_default_310,call_function,einsum.default,backward,20,2,2,1,3937,2221,5 +4330,add_195,call_function,add.Tensor,unknown,,2,2,1,3960,2220,10 +4339,mul_359,call_function,mul.Tensor,backward,20,2,2,1,2217,2218,8 +4338,mul_358,call_function,mul.Tensor,backward,20,2,2,1,3964,2212,8 +4342,mul_360,call_function,mul.Tensor,backward,20,2,2,1,3969,2210,8 +4345,mul_361,call_function,mul.Tensor,backward,20,2,2,1,3972,2208,8 +4346,sub_24,call_function,sub.Tensor,backward,20,2,2,1,3973,2207,10 +4347,mul_362,call_function,mul.Tensor,backward,20,2,2,1,3974,2206,8 +4352,add_196,call_function,add.Tensor,unknown,,2,2,1,3976,2204,10 +4358,einsum_default_312,call_function,einsum.default,backward,19,2,2,1,3979,2198,5 +4364,mul_365,call_function,mul.Tensor,backward,19,2,2,1,3981,2189,8 +4383,mul_368,call_function,mul.Tensor,backward,19,2,2,1,2195,2188,8 +4381,mul_367,call_function,mul.Tensor,backward,19,2,2,1,3991,2187,8 +4385,mul_369,call_function,mul.Tensor,backward,19,2,2,1,3995,2186,8 +4363,mul_364,call_function,mul.Tensor,backward,19,2,2,1,3981,2185,8 +4368,einsum_default_314,call_function,einsum.default,backward,19,2,2,1,3984,2179,5 +4390,einsum_default_316,call_function,einsum.default,backward,19,2,2,1,3999,2179,5 +4391,add_199,call_function,add.Tensor,unknown,,2,2,1,4004,2178,10 +4400,mul_371,call_function,mul.Tensor,backward,19,2,2,1,2174,2176,8 +4399,mul_370,call_function,mul.Tensor,backward,19,2,2,1,4008,2170,8 +4403,mul_372,call_function,mul.Tensor,backward,19,2,2,1,4013,2168,8 +4406,mul_373,call_function,mul.Tensor,backward,19,2,2,1,4016,2166,8 +4407,sub_26,call_function,sub.Tensor,backward,19,2,2,1,4017,2165,10 
+4408,mul_374,call_function,mul.Tensor,backward,19,2,2,1,4018,2164,8 +4413,add_200,call_function,add.Tensor,unknown,,2,2,1,4020,2162,10 +4419,einsum_default_318,call_function,einsum.default,backward,19,2,2,1,4023,2156,5 +4438,mul_376,call_function,mul.Tensor,backward,19,2,2,1,4037,2122,8 +4443,mul_377,call_function,mul.Tensor,backward,19,2,2,1,4037,2121,8 +4456,einsum_default_320,call_function,einsum.default,backward,19,2,2,1,4035,2112,5 +4463,einsum_default_322,call_function,einsum.default,backward,19,2,2,1,4044,2112,5 +4464,add_201,call_function,add.Tensor,unknown,,2,2,1,4051,2111,10 +4471,einsum_default_324,call_function,einsum.default,backward,19,2,2,1,4044,2111,5 +4472,add_202,call_function,add.Tensor,unknown,,2,2,1,4067,2110,10 +4481,mul_379,call_function,mul.Tensor,backward,19,2,2,1,2107,2108,8 +4480,mul_378,call_function,mul.Tensor,backward,19,2,2,1,4071,2102,8 +4484,mul_380,call_function,mul.Tensor,backward,19,2,2,1,4076,2100,8 +4487,mul_381,call_function,mul.Tensor,backward,19,2,2,1,4079,2098,8 +4488,sub_27,call_function,sub.Tensor,backward,19,2,2,1,4080,2097,10 +4489,mul_382,call_function,mul.Tensor,backward,19,2,2,1,4081,2096,8 +4494,add_203,call_function,add.Tensor,unknown,,2,2,1,4083,2094,10 +4500,einsum_default_326,call_function,einsum.default,backward,18,2,2,1,4086,2088,5 +4506,mul_385,call_function,mul.Tensor,backward,18,2,2,1,4088,2079,8 +4525,mul_388,call_function,mul.Tensor,backward,18,2,2,1,2085,2078,8 +4523,mul_387,call_function,mul.Tensor,backward,18,2,2,1,4098,2077,8 +4527,mul_389,call_function,mul.Tensor,backward,18,2,2,1,4102,2076,8 +4505,mul_384,call_function,mul.Tensor,backward,18,2,2,1,4088,2075,8 +4510,einsum_default_328,call_function,einsum.default,backward,18,2,2,1,4091,2069,5 +4532,einsum_default_330,call_function,einsum.default,backward,18,2,2,1,4106,2069,5 +4533,add_206,call_function,add.Tensor,unknown,,2,2,1,4111,2068,10 +4542,mul_391,call_function,mul.Tensor,backward,18,2,2,1,2064,2066,8 
+4541,mul_390,call_function,mul.Tensor,backward,18,2,2,1,4115,2060,8 +4545,mul_392,call_function,mul.Tensor,backward,18,2,2,1,4120,2058,8 +4548,mul_393,call_function,mul.Tensor,backward,18,2,2,1,4123,2056,8 +4549,sub_29,call_function,sub.Tensor,backward,18,2,2,1,4124,2055,10 +4550,mul_394,call_function,mul.Tensor,backward,18,2,2,1,4125,2054,8 +4555,add_207,call_function,add.Tensor,unknown,,2,2,1,4127,2052,10 +4561,einsum_default_332,call_function,einsum.default,backward,18,2,2,1,4130,2046,5 +4580,mul_396,call_function,mul.Tensor,backward,18,2,2,1,4144,2012,8 +4585,mul_397,call_function,mul.Tensor,backward,18,2,2,1,4144,2011,8 +4598,einsum_default_334,call_function,einsum.default,backward,18,2,2,1,4142,2002,5 +4605,einsum_default_336,call_function,einsum.default,backward,18,2,2,1,4151,2002,5 +4606,add_208,call_function,add.Tensor,unknown,,2,2,1,4158,2001,10 +4613,einsum_default_338,call_function,einsum.default,backward,18,2,2,1,4151,2001,5 +4614,add_209,call_function,add.Tensor,unknown,,2,2,1,4174,2000,10 +4623,mul_399,call_function,mul.Tensor,backward,18,2,2,1,1997,1998,8 +4622,mul_398,call_function,mul.Tensor,backward,18,2,2,1,4178,1992,8 +4626,mul_400,call_function,mul.Tensor,backward,18,2,2,1,4183,1990,8 +4629,mul_401,call_function,mul.Tensor,backward,18,2,2,1,4186,1988,8 +4630,sub_30,call_function,sub.Tensor,backward,18,2,2,1,4187,1987,10 +4631,mul_402,call_function,mul.Tensor,backward,18,2,2,1,4188,1986,8 +4636,add_210,call_function,add.Tensor,unknown,,2,2,1,4190,1984,10 +4642,einsum_default_340,call_function,einsum.default,backward,17,2,2,1,4193,1978,5 +4648,mul_405,call_function,mul.Tensor,backward,17,2,2,1,4195,1969,8 +4667,mul_408,call_function,mul.Tensor,backward,17,2,2,1,1975,1968,8 +4665,mul_407,call_function,mul.Tensor,backward,17,2,2,1,4205,1967,8 +4669,mul_409,call_function,mul.Tensor,backward,17,2,2,1,4209,1966,8 +4647,mul_404,call_function,mul.Tensor,backward,17,2,2,1,4195,1965,8 
+4652,einsum_default_342,call_function,einsum.default,backward,17,2,2,1,4198,1959,5 +4674,einsum_default_344,call_function,einsum.default,backward,17,2,2,1,4213,1959,5 +4675,add_213,call_function,add.Tensor,unknown,,2,2,1,4218,1958,10 +4684,mul_411,call_function,mul.Tensor,backward,17,2,2,1,1954,1956,8 +4683,mul_410,call_function,mul.Tensor,backward,17,2,2,1,4222,1950,8 +4687,mul_412,call_function,mul.Tensor,backward,17,2,2,1,4227,1948,8 +4690,mul_413,call_function,mul.Tensor,backward,17,2,2,1,4230,1946,8 +4691,sub_32,call_function,sub.Tensor,backward,17,2,2,1,4231,1945,10 +4692,mul_414,call_function,mul.Tensor,backward,17,2,2,1,4232,1944,8 +4697,add_214,call_function,add.Tensor,unknown,,2,2,1,4234,1942,10 +4703,einsum_default_346,call_function,einsum.default,backward,17,2,2,1,4237,1936,5 +4722,mul_416,call_function,mul.Tensor,backward,17,2,2,1,4251,1902,8 +4727,mul_417,call_function,mul.Tensor,backward,17,2,2,1,4251,1901,8 +4740,einsum_default_348,call_function,einsum.default,backward,17,2,2,1,4249,1892,5 +4747,einsum_default_350,call_function,einsum.default,backward,17,2,2,1,4258,1892,5 +4748,add_215,call_function,add.Tensor,unknown,,2,2,1,4265,1891,10 +4755,einsum_default_352,call_function,einsum.default,backward,17,2,2,1,4258,1891,5 +4756,add_216,call_function,add.Tensor,unknown,,2,2,1,4281,1890,10 +4765,mul_419,call_function,mul.Tensor,backward,17,2,2,1,1887,1888,8 +4764,mul_418,call_function,mul.Tensor,backward,17,2,2,1,4285,1882,8 +4768,mul_420,call_function,mul.Tensor,backward,17,2,2,1,4290,1880,8 +4771,mul_421,call_function,mul.Tensor,backward,17,2,2,1,4293,1878,8 +4772,sub_33,call_function,sub.Tensor,backward,17,2,2,1,4294,1877,10 +4773,mul_422,call_function,mul.Tensor,backward,17,2,2,1,4295,1876,8 +4778,add_217,call_function,add.Tensor,unknown,,2,2,1,4297,1874,10 +4784,einsum_default_354,call_function,einsum.default,backward,16,2,2,1,4300,1868,5 +4790,mul_425,call_function,mul.Tensor,backward,16,2,2,1,4302,1859,8 
+4809,mul_428,call_function,mul.Tensor,backward,16,2,2,1,1865,1858,8 +4807,mul_427,call_function,mul.Tensor,backward,16,2,2,1,4312,1857,8 +4811,mul_429,call_function,mul.Tensor,backward,16,2,2,1,4316,1856,8 +4789,mul_424,call_function,mul.Tensor,backward,16,2,2,1,4302,1855,8 +4794,einsum_default_356,call_function,einsum.default,backward,16,2,2,1,4305,1849,5 +4816,einsum_default_358,call_function,einsum.default,backward,16,2,2,1,4320,1849,5 +4817,add_220,call_function,add.Tensor,unknown,,2,2,1,4325,1848,10 +4826,mul_431,call_function,mul.Tensor,backward,16,2,2,1,1844,1846,8 +4825,mul_430,call_function,mul.Tensor,backward,16,2,2,1,4329,1840,8 +4829,mul_432,call_function,mul.Tensor,backward,16,2,2,1,4334,1838,8 +4832,mul_433,call_function,mul.Tensor,backward,16,2,2,1,4337,1836,8 +4833,sub_35,call_function,sub.Tensor,backward,16,2,2,1,4338,1835,10 +4834,mul_434,call_function,mul.Tensor,backward,16,2,2,1,4339,1834,8 +4839,add_221,call_function,add.Tensor,unknown,,2,2,1,4341,1832,10 +4845,einsum_default_360,call_function,einsum.default,backward,16,2,2,1,4344,1826,5 +4864,mul_436,call_function,mul.Tensor,backward,16,2,2,1,4358,1792,8 +4869,mul_437,call_function,mul.Tensor,backward,16,2,2,1,4358,1791,8 +4882,einsum_default_362,call_function,einsum.default,backward,16,2,2,1,4356,1782,5 +4889,einsum_default_364,call_function,einsum.default,backward,16,2,2,1,4365,1782,5 +4890,add_222,call_function,add.Tensor,unknown,,2,2,1,4372,1781,10 +4897,einsum_default_366,call_function,einsum.default,backward,16,2,2,1,4365,1781,5 +4898,add_223,call_function,add.Tensor,unknown,,2,2,1,4388,1780,10 +4907,mul_439,call_function,mul.Tensor,backward,16,2,2,1,1777,1778,8 +4906,mul_438,call_function,mul.Tensor,backward,16,2,2,1,4392,1772,8 +4910,mul_440,call_function,mul.Tensor,backward,16,2,2,1,4397,1770,8 +4913,mul_441,call_function,mul.Tensor,backward,16,2,2,1,4400,1768,8 +4914,sub_36,call_function,sub.Tensor,backward,16,2,2,1,4401,1767,10 
+4915,mul_442,call_function,mul.Tensor,backward,16,2,2,1,4402,1766,8 +4920,add_224,call_function,add.Tensor,unknown,,2,2,1,4404,1764,10 +4926,einsum_default_368,call_function,einsum.default,backward,15,2,2,1,4407,1758,5 +4932,mul_445,call_function,mul.Tensor,backward,15,2,2,1,4409,1749,8 +4951,mul_448,call_function,mul.Tensor,backward,15,2,2,1,1755,1748,8 +4949,mul_447,call_function,mul.Tensor,backward,15,2,2,1,4419,1747,8 +4953,mul_449,call_function,mul.Tensor,backward,15,2,2,1,4423,1746,8 +4931,mul_444,call_function,mul.Tensor,backward,15,2,2,1,4409,1745,8 +4936,einsum_default_370,call_function,einsum.default,backward,15,2,2,1,4412,1739,5 +4958,einsum_default_372,call_function,einsum.default,backward,15,2,2,1,4427,1739,5 +4959,add_227,call_function,add.Tensor,unknown,,2,2,1,4432,1738,10 +4968,mul_451,call_function,mul.Tensor,backward,15,2,2,1,1734,1736,8 +4967,mul_450,call_function,mul.Tensor,backward,15,2,2,1,4436,1730,8 +4971,mul_452,call_function,mul.Tensor,backward,15,2,2,1,4441,1728,8 +4974,mul_453,call_function,mul.Tensor,backward,15,2,2,1,4444,1726,8 +4975,sub_38,call_function,sub.Tensor,backward,15,2,2,1,4445,1725,10 +4976,mul_454,call_function,mul.Tensor,backward,15,2,2,1,4446,1724,8 +4981,add_228,call_function,add.Tensor,unknown,,2,2,1,4448,1722,10 +4987,einsum_default_374,call_function,einsum.default,backward,15,2,2,1,4451,1716,5 +5006,mul_456,call_function,mul.Tensor,backward,15,2,2,1,4465,1682,8 +5011,mul_457,call_function,mul.Tensor,backward,15,2,2,1,4465,1681,8 +5024,einsum_default_376,call_function,einsum.default,backward,15,2,2,1,4463,1672,5 +5031,einsum_default_378,call_function,einsum.default,backward,15,2,2,1,4472,1672,5 +5032,add_229,call_function,add.Tensor,unknown,,2,2,1,4479,1671,10 +5039,einsum_default_380,call_function,einsum.default,backward,15,2,2,1,4472,1671,5 +5040,add_230,call_function,add.Tensor,unknown,,2,2,1,4495,1670,10 +5049,mul_459,call_function,mul.Tensor,backward,15,2,2,1,1667,1668,8 
+5048,mul_458,call_function,mul.Tensor,backward,15,2,2,1,4499,1662,8 +5052,mul_460,call_function,mul.Tensor,backward,15,2,2,1,4504,1660,8 +5055,mul_461,call_function,mul.Tensor,backward,15,2,2,1,4507,1658,8 +5056,sub_39,call_function,sub.Tensor,backward,15,2,2,1,4508,1657,10 +5057,mul_462,call_function,mul.Tensor,backward,15,2,2,1,4509,1656,8 +5062,add_231,call_function,add.Tensor,unknown,,2,2,1,4511,1654,10 +5068,einsum_default_382,call_function,einsum.default,backward,14,2,2,1,4514,1648,5 +5074,mul_465,call_function,mul.Tensor,backward,14,2,2,1,4516,1639,8 +5093,mul_468,call_function,mul.Tensor,backward,14,2,2,1,1645,1638,8 +5091,mul_467,call_function,mul.Tensor,backward,14,2,2,1,4526,1637,8 +5095,mul_469,call_function,mul.Tensor,backward,14,2,2,1,4530,1636,8 +5073,mul_464,call_function,mul.Tensor,backward,14,2,2,1,4516,1635,8 +5078,einsum_default_384,call_function,einsum.default,backward,14,2,2,1,4519,1629,5 +5100,einsum_default_386,call_function,einsum.default,backward,14,2,2,1,4534,1629,5 +5101,add_234,call_function,add.Tensor,unknown,,2,2,1,4539,1628,10 +5110,mul_471,call_function,mul.Tensor,backward,14,2,2,1,1624,1626,8 +5109,mul_470,call_function,mul.Tensor,backward,14,2,2,1,4543,1620,8 +5113,mul_472,call_function,mul.Tensor,backward,14,2,2,1,4548,1618,8 +5116,mul_473,call_function,mul.Tensor,backward,14,2,2,1,4551,1616,8 +5117,sub_41,call_function,sub.Tensor,backward,14,2,2,1,4552,1615,10 +5118,mul_474,call_function,mul.Tensor,backward,14,2,2,1,4553,1614,8 +5123,add_235,call_function,add.Tensor,unknown,,2,2,1,4555,1612,10 +5129,einsum_default_388,call_function,einsum.default,backward,14,2,2,1,4558,1606,5 +5148,mul_476,call_function,mul.Tensor,backward,14,2,2,1,4572,1572,8 +5153,mul_477,call_function,mul.Tensor,backward,14,2,2,1,4572,1571,8 +5166,einsum_default_390,call_function,einsum.default,backward,14,2,2,1,4570,1562,5 +5173,einsum_default_392,call_function,einsum.default,backward,14,2,2,1,4579,1562,5 
+5174,add_236,call_function,add.Tensor,unknown,,2,2,1,4586,1561,10 +5181,einsum_default_394,call_function,einsum.default,backward,14,2,2,1,4579,1561,5 +5182,add_237,call_function,add.Tensor,unknown,,2,2,1,4602,1560,10 +5191,mul_479,call_function,mul.Tensor,backward,14,2,2,1,1557,1558,8 +5190,mul_478,call_function,mul.Tensor,backward,14,2,2,1,4606,1552,8 +5194,mul_480,call_function,mul.Tensor,backward,14,2,2,1,4611,1550,8 +5197,mul_481,call_function,mul.Tensor,backward,14,2,2,1,4614,1548,8 +5198,sub_42,call_function,sub.Tensor,backward,14,2,2,1,4615,1547,10 +5199,mul_482,call_function,mul.Tensor,backward,14,2,2,1,4616,1546,8 +5204,add_238,call_function,add.Tensor,unknown,,2,2,1,4618,1544,10 +5210,einsum_default_396,call_function,einsum.default,backward,13,2,2,1,4621,1538,5 +5216,mul_485,call_function,mul.Tensor,backward,13,2,2,1,4623,1529,8 +5235,mul_488,call_function,mul.Tensor,backward,13,2,2,1,1535,1528,8 +5233,mul_487,call_function,mul.Tensor,backward,13,2,2,1,4633,1527,8 +5237,mul_489,call_function,mul.Tensor,backward,13,2,2,1,4637,1526,8 +5215,mul_484,call_function,mul.Tensor,backward,13,2,2,1,4623,1525,8 +5220,einsum_default_398,call_function,einsum.default,backward,13,2,2,1,4626,1519,5 +5242,einsum_default_400,call_function,einsum.default,backward,13,2,2,1,4641,1519,5 +5243,add_241,call_function,add.Tensor,unknown,,2,2,1,4646,1518,10 +5252,mul_491,call_function,mul.Tensor,backward,13,2,2,1,1514,1516,8 +5251,mul_490,call_function,mul.Tensor,backward,13,2,2,1,4650,1510,8 +5255,mul_492,call_function,mul.Tensor,backward,13,2,2,1,4655,1508,8 +5258,mul_493,call_function,mul.Tensor,backward,13,2,2,1,4658,1506,8 +5259,sub_44,call_function,sub.Tensor,backward,13,2,2,1,4659,1505,10 +5260,mul_494,call_function,mul.Tensor,backward,13,2,2,1,4660,1504,8 +5265,add_242,call_function,add.Tensor,unknown,,2,2,1,4662,1502,10 +5271,einsum_default_402,call_function,einsum.default,backward,13,2,2,1,4665,1496,5 +5290,mul_496,call_function,mul.Tensor,backward,13,2,2,1,4679,1462,8 
+5295,mul_497,call_function,mul.Tensor,backward,13,2,2,1,4679,1461,8 +5308,einsum_default_404,call_function,einsum.default,backward,13,2,2,1,4677,1452,5 +5315,einsum_default_406,call_function,einsum.default,backward,13,2,2,1,4686,1452,5 +5316,add_243,call_function,add.Tensor,unknown,,2,2,1,4693,1451,10 +5323,einsum_default_408,call_function,einsum.default,backward,13,2,2,1,4686,1451,5 +5324,add_244,call_function,add.Tensor,unknown,,2,2,1,4709,1450,10 +5333,mul_499,call_function,mul.Tensor,backward,13,2,2,1,1447,1448,8 +5332,mul_498,call_function,mul.Tensor,backward,13,2,2,1,4713,1442,8 +5336,mul_500,call_function,mul.Tensor,backward,13,2,2,1,4718,1440,8 +5339,mul_501,call_function,mul.Tensor,backward,13,2,2,1,4721,1438,8 +5340,sub_45,call_function,sub.Tensor,backward,13,2,2,1,4722,1437,10 +5341,mul_502,call_function,mul.Tensor,backward,13,2,2,1,4723,1436,8 +5346,add_245,call_function,add.Tensor,unknown,,2,2,1,4725,1434,10 +5352,einsum_default_410,call_function,einsum.default,backward,12,2,2,1,4728,1428,5 +5358,mul_505,call_function,mul.Tensor,backward,12,2,2,1,4730,1419,8 +5377,mul_508,call_function,mul.Tensor,backward,12,2,2,1,1425,1418,8 +5375,mul_507,call_function,mul.Tensor,backward,12,2,2,1,4740,1417,8 +5379,mul_509,call_function,mul.Tensor,backward,12,2,2,1,4744,1416,8 +5357,mul_504,call_function,mul.Tensor,backward,12,2,2,1,4730,1415,8 +5362,einsum_default_412,call_function,einsum.default,backward,12,2,2,1,4733,1409,5 +5384,einsum_default_414,call_function,einsum.default,backward,12,2,2,1,4748,1409,5 +5385,add_248,call_function,add.Tensor,unknown,,2,2,1,4753,1408,10 +5394,mul_511,call_function,mul.Tensor,backward,12,2,2,1,1404,1406,8 +5393,mul_510,call_function,mul.Tensor,backward,12,2,2,1,4757,1400,8 +5397,mul_512,call_function,mul.Tensor,backward,12,2,2,1,4762,1398,8 +5400,mul_513,call_function,mul.Tensor,backward,12,2,2,1,4765,1396,8 +5401,sub_47,call_function,sub.Tensor,backward,12,2,2,1,4766,1395,10 
+5402,mul_514,call_function,mul.Tensor,backward,12,2,2,1,4767,1394,8 +5407,add_249,call_function,add.Tensor,unknown,,2,2,1,4769,1392,10 +5413,einsum_default_416,call_function,einsum.default,backward,12,2,2,1,4772,1386,5 +5432,mul_516,call_function,mul.Tensor,backward,12,2,2,1,4786,1352,8 +5437,mul_517,call_function,mul.Tensor,backward,12,2,2,1,4786,1351,8 +5450,einsum_default_418,call_function,einsum.default,backward,12,2,2,1,4784,1342,5 +5457,einsum_default_420,call_function,einsum.default,backward,12,2,2,1,4793,1342,5 +5458,add_250,call_function,add.Tensor,unknown,,2,2,1,4800,1341,10 +5465,einsum_default_422,call_function,einsum.default,backward,12,2,2,1,4793,1341,5 +5466,add_251,call_function,add.Tensor,unknown,,2,2,1,4816,1340,10 +5475,mul_519,call_function,mul.Tensor,backward,12,2,2,1,1337,1338,8 +5474,mul_518,call_function,mul.Tensor,backward,12,2,2,1,4820,1332,8 +5478,mul_520,call_function,mul.Tensor,backward,12,2,2,1,4825,1330,8 +5481,mul_521,call_function,mul.Tensor,backward,12,2,2,1,4828,1328,8 +5482,sub_48,call_function,sub.Tensor,backward,12,2,2,1,4829,1327,10 +5483,mul_522,call_function,mul.Tensor,backward,12,2,2,1,4830,1326,8 +5488,add_252,call_function,add.Tensor,unknown,,2,2,1,4832,1324,10 +5494,einsum_default_424,call_function,einsum.default,backward,11,2,2,1,4835,1318,5 +5500,mul_525,call_function,mul.Tensor,backward,11,2,2,1,4837,1309,8 +5519,mul_528,call_function,mul.Tensor,backward,11,2,2,1,1315,1308,8 +5517,mul_527,call_function,mul.Tensor,backward,11,2,2,1,4847,1307,8 +5521,mul_529,call_function,mul.Tensor,backward,11,2,2,1,4851,1306,8 +5499,mul_524,call_function,mul.Tensor,backward,11,2,2,1,4837,1305,8 +5504,einsum_default_426,call_function,einsum.default,backward,11,2,2,1,4840,1299,5 +5526,einsum_default_428,call_function,einsum.default,backward,11,2,2,1,4855,1299,5 +5527,add_255,call_function,add.Tensor,unknown,,2,2,1,4860,1298,10 +5536,mul_531,call_function,mul.Tensor,backward,11,2,2,1,1294,1296,8 
+5535,mul_530,call_function,mul.Tensor,backward,11,2,2,1,4864,1290,8 +5539,mul_532,call_function,mul.Tensor,backward,11,2,2,1,4869,1288,8 +5542,mul_533,call_function,mul.Tensor,backward,11,2,2,1,4872,1286,8 +5543,sub_50,call_function,sub.Tensor,backward,11,2,2,1,4873,1285,10 +5544,mul_534,call_function,mul.Tensor,backward,11,2,2,1,4874,1284,8 +5549,add_256,call_function,add.Tensor,unknown,,2,2,1,4876,1282,10 +5555,einsum_default_430,call_function,einsum.default,backward,11,2,2,1,4879,1276,5 +5574,mul_536,call_function,mul.Tensor,backward,11,2,2,1,4893,1242,8 +5579,mul_537,call_function,mul.Tensor,backward,11,2,2,1,4893,1241,8 +5592,einsum_default_432,call_function,einsum.default,backward,11,2,2,1,4891,1232,5 +5599,einsum_default_434,call_function,einsum.default,backward,11,2,2,1,4900,1232,5 +5600,add_257,call_function,add.Tensor,unknown,,2,2,1,4907,1231,10 +5607,einsum_default_436,call_function,einsum.default,backward,11,2,2,1,4900,1231,5 +5608,add_258,call_function,add.Tensor,unknown,,2,2,1,4923,1230,10 +5617,mul_539,call_function,mul.Tensor,backward,11,2,2,1,1227,1228,8 +5616,mul_538,call_function,mul.Tensor,backward,11,2,2,1,4927,1222,8 +5620,mul_540,call_function,mul.Tensor,backward,11,2,2,1,4932,1220,8 +5623,mul_541,call_function,mul.Tensor,backward,11,2,2,1,4935,1218,8 +5624,sub_51,call_function,sub.Tensor,backward,11,2,2,1,4936,1217,10 +5625,mul_542,call_function,mul.Tensor,backward,11,2,2,1,4937,1216,8 +5630,add_259,call_function,add.Tensor,unknown,,2,2,1,4939,1214,10 +5636,einsum_default_438,call_function,einsum.default,backward,10,2,2,1,4942,1208,5 +5642,mul_545,call_function,mul.Tensor,backward,10,2,2,1,4944,1199,8 +5661,mul_548,call_function,mul.Tensor,backward,10,2,2,1,1205,1198,8 +5659,mul_547,call_function,mul.Tensor,backward,10,2,2,1,4954,1197,8 +5663,mul_549,call_function,mul.Tensor,backward,10,2,2,1,4958,1196,8 +5641,mul_544,call_function,mul.Tensor,backward,10,2,2,1,4944,1195,8 
+5646,einsum_default_440,call_function,einsum.default,backward,10,2,2,1,4947,1189,5 +5668,einsum_default_442,call_function,einsum.default,backward,10,2,2,1,4962,1189,5 +5669,add_262,call_function,add.Tensor,unknown,,2,2,1,4967,1188,10 +5678,mul_551,call_function,mul.Tensor,backward,10,2,2,1,1184,1186,8 +5677,mul_550,call_function,mul.Tensor,backward,10,2,2,1,4971,1180,8 +5681,mul_552,call_function,mul.Tensor,backward,10,2,2,1,4976,1178,8 +5684,mul_553,call_function,mul.Tensor,backward,10,2,2,1,4979,1176,8 +5685,sub_53,call_function,sub.Tensor,backward,10,2,2,1,4980,1175,10 +5686,mul_554,call_function,mul.Tensor,backward,10,2,2,1,4981,1174,8 +5691,add_263,call_function,add.Tensor,unknown,,2,2,1,4983,1172,10 +5697,einsum_default_444,call_function,einsum.default,backward,10,2,2,1,4986,1166,5 +5716,mul_556,call_function,mul.Tensor,backward,10,2,2,1,5000,1132,8 +5721,mul_557,call_function,mul.Tensor,backward,10,2,2,1,5000,1131,8 +5734,einsum_default_446,call_function,einsum.default,backward,10,2,2,1,4998,1122,5 +5741,einsum_default_448,call_function,einsum.default,backward,10,2,2,1,5007,1122,5 +5742,add_264,call_function,add.Tensor,unknown,,2,2,1,5014,1121,10 +5749,einsum_default_450,call_function,einsum.default,backward,10,2,2,1,5007,1121,5 +5750,add_265,call_function,add.Tensor,unknown,,2,2,1,5030,1120,10 +5759,mul_559,call_function,mul.Tensor,backward,10,2,2,1,1117,1118,8 +5758,mul_558,call_function,mul.Tensor,backward,10,2,2,1,5034,1112,8 +5762,mul_560,call_function,mul.Tensor,backward,10,2,2,1,5039,1110,8 +5765,mul_561,call_function,mul.Tensor,backward,10,2,2,1,5042,1108,8 +5766,sub_54,call_function,sub.Tensor,backward,10,2,2,1,5043,1107,10 +5767,mul_562,call_function,mul.Tensor,backward,10,2,2,1,5044,1106,8 +5772,add_266,call_function,add.Tensor,unknown,,2,2,1,5046,1104,10 +5778,einsum_default_452,call_function,einsum.default,backward,9,2,2,1,5049,1098,5 +5784,mul_565,call_function,mul.Tensor,backward,9,2,2,1,5051,1089,8 
+5803,mul_568,call_function,mul.Tensor,backward,9,2,2,1,1095,1088,8 +5801,mul_567,call_function,mul.Tensor,backward,9,2,2,1,5061,1087,8 +5805,mul_569,call_function,mul.Tensor,backward,9,2,2,1,5065,1086,8 +5783,mul_564,call_function,mul.Tensor,backward,9,2,2,1,5051,1085,8 +5788,einsum_default_454,call_function,einsum.default,backward,9,2,2,1,5054,1079,5 +5810,einsum_default_456,call_function,einsum.default,backward,9,2,2,1,5069,1079,5 +5811,add_269,call_function,add.Tensor,unknown,,2,2,1,5074,1078,10 +5820,mul_571,call_function,mul.Tensor,backward,9,2,2,1,1074,1076,8 +5819,mul_570,call_function,mul.Tensor,backward,9,2,2,1,5078,1070,8 +5823,mul_572,call_function,mul.Tensor,backward,9,2,2,1,5083,1068,8 +5826,mul_573,call_function,mul.Tensor,backward,9,2,2,1,5086,1066,8 +5827,sub_56,call_function,sub.Tensor,backward,9,2,2,1,5087,1065,10 +5828,mul_574,call_function,mul.Tensor,backward,9,2,2,1,5088,1064,8 +5833,add_270,call_function,add.Tensor,unknown,,2,2,1,5090,1062,10 +5839,einsum_default_458,call_function,einsum.default,backward,9,2,2,1,5093,1056,5 +5858,mul_576,call_function,mul.Tensor,backward,9,2,2,1,5107,1022,8 +5863,mul_577,call_function,mul.Tensor,backward,9,2,2,1,5107,1021,8 +5876,einsum_default_460,call_function,einsum.default,backward,9,2,2,1,5105,1012,5 +5883,einsum_default_462,call_function,einsum.default,backward,9,2,2,1,5114,1012,5 +5884,add_271,call_function,add.Tensor,unknown,,2,2,1,5121,1011,10 +5891,einsum_default_464,call_function,einsum.default,backward,9,2,2,1,5114,1011,5 +5892,add_272,call_function,add.Tensor,unknown,,2,2,1,5137,1010,10 +5901,mul_579,call_function,mul.Tensor,backward,9,2,2,1,1007,1008,8 +5900,mul_578,call_function,mul.Tensor,backward,9,2,2,1,5141,1002,8 +5904,mul_580,call_function,mul.Tensor,backward,9,2,2,1,5146,1000,8 +5907,mul_581,call_function,mul.Tensor,backward,9,2,2,1,5149,998,8 +5908,sub_57,call_function,sub.Tensor,backward,9,2,2,1,5150,997,10 +5909,mul_582,call_function,mul.Tensor,backward,9,2,2,1,5151,996,8 
+5914,add_273,call_function,add.Tensor,unknown,,2,2,1,5153,994,10 +5920,einsum_default_466,call_function,einsum.default,backward,8,2,2,1,5156,988,5 +5926,mul_585,call_function,mul.Tensor,backward,8,2,2,1,5158,979,8 +5945,mul_588,call_function,mul.Tensor,backward,8,2,2,1,985,978,8 +5943,mul_587,call_function,mul.Tensor,backward,8,2,2,1,5168,977,8 +5947,mul_589,call_function,mul.Tensor,backward,8,2,2,1,5172,976,8 +5925,mul_584,call_function,mul.Tensor,backward,8,2,2,1,5158,975,8 +5930,einsum_default_468,call_function,einsum.default,backward,8,2,2,1,5161,969,5 +5952,einsum_default_470,call_function,einsum.default,backward,8,2,2,1,5176,969,5 +5953,add_276,call_function,add.Tensor,unknown,,2,2,1,5181,968,10 +5962,mul_591,call_function,mul.Tensor,backward,8,2,2,1,964,966,8 +5961,mul_590,call_function,mul.Tensor,backward,8,2,2,1,5185,960,8 +5965,mul_592,call_function,mul.Tensor,backward,8,2,2,1,5190,958,8 +5968,mul_593,call_function,mul.Tensor,backward,8,2,2,1,5193,956,8 +5969,sub_59,call_function,sub.Tensor,backward,8,2,2,1,5194,955,10 +5970,mul_594,call_function,mul.Tensor,backward,8,2,2,1,5195,954,8 +5975,add_277,call_function,add.Tensor,unknown,,2,2,1,5197,952,10 +5981,einsum_default_472,call_function,einsum.default,backward,8,2,2,1,5200,946,5 +6000,mul_596,call_function,mul.Tensor,backward,8,2,2,1,5214,912,8 +6005,mul_597,call_function,mul.Tensor,backward,8,2,2,1,5214,911,8 +6018,einsum_default_474,call_function,einsum.default,backward,8,2,2,1,5212,902,5 +6025,einsum_default_476,call_function,einsum.default,backward,8,2,2,1,5221,902,5 +6026,add_278,call_function,add.Tensor,unknown,,2,2,1,5228,901,10 +6033,einsum_default_478,call_function,einsum.default,backward,8,2,2,1,5221,901,5 +6034,add_279,call_function,add.Tensor,unknown,,2,2,1,5244,900,10 +6043,mul_599,call_function,mul.Tensor,backward,8,2,2,1,897,898,8 +6042,mul_598,call_function,mul.Tensor,backward,8,2,2,1,5248,892,8 +6046,mul_600,call_function,mul.Tensor,backward,8,2,2,1,5253,890,8 
+6049,mul_601,call_function,mul.Tensor,backward,8,2,2,1,5256,888,8 +6050,sub_60,call_function,sub.Tensor,backward,8,2,2,1,5257,887,10 +6051,mul_602,call_function,mul.Tensor,backward,8,2,2,1,5258,886,8 +6056,add_280,call_function,add.Tensor,unknown,,2,2,1,5260,884,10 +6062,einsum_default_480,call_function,einsum.default,backward,7,2,2,1,5263,878,5 +6068,mul_605,call_function,mul.Tensor,backward,7,2,2,1,5265,869,8 +6087,mul_608,call_function,mul.Tensor,backward,7,2,2,1,875,868,8 +6085,mul_607,call_function,mul.Tensor,backward,7,2,2,1,5275,867,8 +6089,mul_609,call_function,mul.Tensor,backward,7,2,2,1,5279,866,8 +6067,mul_604,call_function,mul.Tensor,backward,7,2,2,1,5265,865,8 +6072,einsum_default_482,call_function,einsum.default,backward,7,2,2,1,5268,859,5 +6094,einsum_default_484,call_function,einsum.default,backward,7,2,2,1,5283,859,5 +6095,add_283,call_function,add.Tensor,unknown,,2,2,1,5288,858,10 +6104,mul_611,call_function,mul.Tensor,backward,7,2,2,1,854,856,8 +6103,mul_610,call_function,mul.Tensor,backward,7,2,2,1,5292,850,8 +6107,mul_612,call_function,mul.Tensor,backward,7,2,2,1,5297,848,8 +6110,mul_613,call_function,mul.Tensor,backward,7,2,2,1,5300,846,8 +6111,sub_62,call_function,sub.Tensor,backward,7,2,2,1,5301,845,10 +6112,mul_614,call_function,mul.Tensor,backward,7,2,2,1,5302,844,8 +6117,add_284,call_function,add.Tensor,unknown,,2,2,1,5304,842,10 +6123,einsum_default_486,call_function,einsum.default,backward,7,2,2,1,5307,836,5 +6142,mul_616,call_function,mul.Tensor,backward,7,2,2,1,5321,802,8 +6147,mul_617,call_function,mul.Tensor,backward,7,2,2,1,5321,801,8 +6160,einsum_default_488,call_function,einsum.default,backward,7,2,2,1,5319,792,5 +6167,einsum_default_490,call_function,einsum.default,backward,7,2,2,1,5328,792,5 +6168,add_285,call_function,add.Tensor,unknown,,2,2,1,5335,791,10 +6175,einsum_default_492,call_function,einsum.default,backward,7,2,2,1,5328,791,5 +6176,add_286,call_function,add.Tensor,unknown,,2,2,1,5351,790,10 
+6185,mul_619,call_function,mul.Tensor,backward,7,2,2,1,787,788,8 +6184,mul_618,call_function,mul.Tensor,backward,7,2,2,1,5355,782,8 +6188,mul_620,call_function,mul.Tensor,backward,7,2,2,1,5360,780,8 +6191,mul_621,call_function,mul.Tensor,backward,7,2,2,1,5363,778,8 +6192,sub_63,call_function,sub.Tensor,backward,7,2,2,1,5364,777,10 +6193,mul_622,call_function,mul.Tensor,backward,7,2,2,1,5365,776,8 +6198,add_287,call_function,add.Tensor,unknown,,2,2,1,5367,774,10 +6204,einsum_default_494,call_function,einsum.default,backward,6,2,2,1,5370,768,5 +6210,mul_625,call_function,mul.Tensor,backward,6,2,2,1,5372,759,8 +6229,mul_628,call_function,mul.Tensor,backward,6,2,2,1,765,758,8 +6227,mul_627,call_function,mul.Tensor,backward,6,2,2,1,5382,757,8 +6231,mul_629,call_function,mul.Tensor,backward,6,2,2,1,5386,756,8 +6209,mul_624,call_function,mul.Tensor,backward,6,2,2,1,5372,755,8 +6214,einsum_default_496,call_function,einsum.default,backward,6,2,2,1,5375,749,5 +6236,einsum_default_498,call_function,einsum.default,backward,6,2,2,1,5390,749,5 +6237,add_290,call_function,add.Tensor,unknown,,2,2,1,5395,748,10 +6246,mul_631,call_function,mul.Tensor,backward,6,2,2,1,744,746,8 +6245,mul_630,call_function,mul.Tensor,backward,6,2,2,1,5399,740,8 +6249,mul_632,call_function,mul.Tensor,backward,6,2,2,1,5404,738,8 +6252,mul_633,call_function,mul.Tensor,backward,6,2,2,1,5407,736,8 +6253,sub_65,call_function,sub.Tensor,backward,6,2,2,1,5408,735,10 +6254,mul_634,call_function,mul.Tensor,backward,6,2,2,1,5409,734,8 +6259,add_291,call_function,add.Tensor,unknown,,2,2,1,5411,732,10 +6265,einsum_default_500,call_function,einsum.default,backward,6,2,2,1,5414,726,5 +6284,mul_636,call_function,mul.Tensor,backward,6,2,2,1,5428,692,8 +6289,mul_637,call_function,mul.Tensor,backward,6,2,2,1,5428,691,8 +6302,einsum_default_502,call_function,einsum.default,backward,6,2,2,1,5426,682,5 +6309,einsum_default_504,call_function,einsum.default,backward,6,2,2,1,5435,682,5 
+6310,add_292,call_function,add.Tensor,unknown,,2,2,1,5442,681,10 +6317,einsum_default_506,call_function,einsum.default,backward,6,2,2,1,5435,681,5 +6318,add_293,call_function,add.Tensor,unknown,,2,2,1,5458,680,10 +6327,mul_639,call_function,mul.Tensor,backward,6,2,2,1,677,678,8 +6326,mul_638,call_function,mul.Tensor,backward,6,2,2,1,5462,672,8 +6330,mul_640,call_function,mul.Tensor,backward,6,2,2,1,5467,670,8 +6333,mul_641,call_function,mul.Tensor,backward,6,2,2,1,5470,668,8 +6334,sub_66,call_function,sub.Tensor,backward,6,2,2,1,5471,667,10 +6335,mul_642,call_function,mul.Tensor,backward,6,2,2,1,5472,666,8 +6340,add_294,call_function,add.Tensor,unknown,,2,2,1,5474,664,10 +6346,einsum_default_508,call_function,einsum.default,backward,5,2,2,1,5477,658,5 +6352,mul_645,call_function,mul.Tensor,backward,5,2,2,1,5479,649,8 +6371,mul_648,call_function,mul.Tensor,backward,5,2,2,1,655,648,8 +6369,mul_647,call_function,mul.Tensor,backward,5,2,2,1,5489,647,8 +6373,mul_649,call_function,mul.Tensor,backward,5,2,2,1,5493,646,8 +6351,mul_644,call_function,mul.Tensor,backward,5,2,2,1,5479,645,8 +6356,einsum_default_510,call_function,einsum.default,backward,5,2,2,1,5482,639,5 +6378,einsum_default_512,call_function,einsum.default,backward,5,2,2,1,5497,639,5 +6379,add_297,call_function,add.Tensor,unknown,,2,2,1,5502,638,10 +6388,mul_651,call_function,mul.Tensor,backward,5,2,2,1,634,636,8 +6387,mul_650,call_function,mul.Tensor,backward,5,2,2,1,5506,630,8 +6391,mul_652,call_function,mul.Tensor,backward,5,2,2,1,5511,628,8 +6394,mul_653,call_function,mul.Tensor,backward,5,2,2,1,5514,626,8 +6395,sub_68,call_function,sub.Tensor,backward,5,2,2,1,5515,625,10 +6396,mul_654,call_function,mul.Tensor,backward,5,2,2,1,5516,624,8 +6401,add_298,call_function,add.Tensor,unknown,,2,2,1,5518,622,10 +6407,einsum_default_514,call_function,einsum.default,backward,5,2,2,1,5521,616,5 +6426,mul_656,call_function,mul.Tensor,backward,5,2,2,1,5535,582,8 
+6431,mul_657,call_function,mul.Tensor,backward,5,2,2,1,5535,581,8 +6444,einsum_default_516,call_function,einsum.default,backward,5,2,2,1,5533,572,5 +6451,einsum_default_518,call_function,einsum.default,backward,5,2,2,1,5542,572,5 +6452,add_299,call_function,add.Tensor,unknown,,2,2,1,5549,571,10 +6459,einsum_default_520,call_function,einsum.default,backward,5,2,2,1,5542,571,5 +6460,add_300,call_function,add.Tensor,unknown,,2,2,1,5565,570,10 +6469,mul_659,call_function,mul.Tensor,backward,5,2,2,1,567,568,8 +6468,mul_658,call_function,mul.Tensor,backward,5,2,2,1,5569,562,8 +6472,mul_660,call_function,mul.Tensor,backward,5,2,2,1,5574,560,8 +6475,mul_661,call_function,mul.Tensor,backward,5,2,2,1,5577,558,8 +6476,sub_69,call_function,sub.Tensor,backward,5,2,2,1,5578,557,10 +6477,mul_662,call_function,mul.Tensor,backward,5,2,2,1,5579,556,8 +6482,add_301,call_function,add.Tensor,unknown,,2,2,1,5581,554,10 +6488,einsum_default_522,call_function,einsum.default,backward,4,2,2,1,5584,548,5 +6494,mul_665,call_function,mul.Tensor,backward,4,2,2,1,5586,539,8 +6513,mul_668,call_function,mul.Tensor,backward,4,2,2,1,545,538,8 +6511,mul_667,call_function,mul.Tensor,backward,4,2,2,1,5596,537,8 +6515,mul_669,call_function,mul.Tensor,backward,4,2,2,1,5600,536,8 +6493,mul_664,call_function,mul.Tensor,backward,4,2,2,1,5586,535,8 +6498,einsum_default_524,call_function,einsum.default,backward,4,2,2,1,5589,529,5 +6520,einsum_default_526,call_function,einsum.default,backward,4,2,2,1,5604,529,5 +6521,add_304,call_function,add.Tensor,unknown,,2,2,1,5609,528,10 +6530,mul_671,call_function,mul.Tensor,backward,4,2,2,1,524,526,8 +6529,mul_670,call_function,mul.Tensor,backward,4,2,2,1,5613,520,8 +6533,mul_672,call_function,mul.Tensor,backward,4,2,2,1,5618,518,8 +6536,mul_673,call_function,mul.Tensor,backward,4,2,2,1,5621,516,8 +6537,sub_71,call_function,sub.Tensor,backward,4,2,2,1,5622,515,10 +6538,mul_674,call_function,mul.Tensor,backward,4,2,2,1,5623,514,8 
+6543,add_305,call_function,add.Tensor,unknown,,2,2,1,5625,512,10 +6549,einsum_default_528,call_function,einsum.default,backward,4,2,2,1,5628,506,5 +6568,mul_676,call_function,mul.Tensor,backward,4,2,2,1,5642,472,8 +6573,mul_677,call_function,mul.Tensor,backward,4,2,2,1,5642,471,8 +6586,einsum_default_530,call_function,einsum.default,backward,4,2,2,1,5640,462,5 +6593,einsum_default_532,call_function,einsum.default,backward,4,2,2,1,5649,462,5 +6594,add_306,call_function,add.Tensor,unknown,,2,2,1,5656,461,10 +6601,einsum_default_534,call_function,einsum.default,backward,4,2,2,1,5649,461,5 +6602,add_307,call_function,add.Tensor,unknown,,2,2,1,5672,460,10 +6611,mul_679,call_function,mul.Tensor,backward,4,2,2,1,457,458,8 +6610,mul_678,call_function,mul.Tensor,backward,4,2,2,1,5676,452,8 +6614,mul_680,call_function,mul.Tensor,backward,4,2,2,1,5681,450,8 +6617,mul_681,call_function,mul.Tensor,backward,4,2,2,1,5684,448,8 +6618,sub_72,call_function,sub.Tensor,backward,4,2,2,1,5685,447,10 +6619,mul_682,call_function,mul.Tensor,backward,4,2,2,1,5686,446,8 +6624,add_308,call_function,add.Tensor,unknown,,2,2,1,5688,444,10 +6630,einsum_default_536,call_function,einsum.default,backward,3,2,2,1,5691,438,5 +6636,mul_685,call_function,mul.Tensor,backward,3,2,2,1,5693,429,8 +6655,mul_688,call_function,mul.Tensor,backward,3,2,2,1,435,428,8 +6653,mul_687,call_function,mul.Tensor,backward,3,2,2,1,5703,427,8 +6657,mul_689,call_function,mul.Tensor,backward,3,2,2,1,5707,426,8 +6635,mul_684,call_function,mul.Tensor,backward,3,2,2,1,5693,425,8 +6640,einsum_default_538,call_function,einsum.default,backward,3,2,2,1,5696,419,5 +6662,einsum_default_540,call_function,einsum.default,backward,3,2,2,1,5711,419,5 +6663,add_311,call_function,add.Tensor,unknown,,2,2,1,5716,418,10 +6672,mul_691,call_function,mul.Tensor,backward,3,2,2,1,414,416,8 +6671,mul_690,call_function,mul.Tensor,backward,3,2,2,1,5720,410,8 +6675,mul_692,call_function,mul.Tensor,backward,3,2,2,1,5725,408,8 
+6678,mul_693,call_function,mul.Tensor,backward,3,2,2,1,5728,406,8 +6679,sub_74,call_function,sub.Tensor,backward,3,2,2,1,5729,405,10 +6680,mul_694,call_function,mul.Tensor,backward,3,2,2,1,5730,404,8 +6685,add_312,call_function,add.Tensor,unknown,,2,2,1,5732,402,10 +6691,einsum_default_542,call_function,einsum.default,backward,3,2,2,1,5735,396,5 +6710,mul_696,call_function,mul.Tensor,backward,3,2,2,1,5749,362,8 +6715,mul_697,call_function,mul.Tensor,backward,3,2,2,1,5749,361,8 +6728,einsum_default_544,call_function,einsum.default,backward,3,2,2,1,5747,352,5 +6735,einsum_default_546,call_function,einsum.default,backward,3,2,2,1,5756,352,5 +6736,add_313,call_function,add.Tensor,unknown,,2,2,1,5763,351,10 +6743,einsum_default_548,call_function,einsum.default,backward,3,2,2,1,5756,351,5 +6744,add_314,call_function,add.Tensor,unknown,,2,2,1,5779,350,10 +6753,mul_699,call_function,mul.Tensor,backward,3,2,2,1,347,348,8 +6752,mul_698,call_function,mul.Tensor,backward,3,2,2,1,5783,342,8 +6756,mul_700,call_function,mul.Tensor,backward,3,2,2,1,5788,340,8 +6759,mul_701,call_function,mul.Tensor,backward,3,2,2,1,5791,338,8 +6760,sub_75,call_function,sub.Tensor,backward,3,2,2,1,5792,337,10 +6761,mul_702,call_function,mul.Tensor,backward,3,2,2,1,5793,336,8 +6766,add_315,call_function,add.Tensor,unknown,,2,2,1,5795,334,10 +6772,einsum_default_550,call_function,einsum.default,backward,2,2,2,1,5798,328,5 +6778,mul_705,call_function,mul.Tensor,backward,2,2,2,1,5800,319,8 +6797,mul_708,call_function,mul.Tensor,backward,2,2,2,1,325,318,8 +6795,mul_707,call_function,mul.Tensor,backward,2,2,2,1,5810,317,8 +6799,mul_709,call_function,mul.Tensor,backward,2,2,2,1,5814,316,8 +6777,mul_704,call_function,mul.Tensor,backward,2,2,2,1,5800,315,8 +6782,einsum_default_552,call_function,einsum.default,backward,2,2,2,1,5803,309,5 +6804,einsum_default_554,call_function,einsum.default,backward,2,2,2,1,5818,309,5 +6805,add_318,call_function,add.Tensor,unknown,,2,2,1,5823,308,10 
+6814,mul_711,call_function,mul.Tensor,backward,2,2,2,1,304,306,8 +6813,mul_710,call_function,mul.Tensor,backward,2,2,2,1,5827,300,8 +6817,mul_712,call_function,mul.Tensor,backward,2,2,2,1,5832,298,8 +6820,mul_713,call_function,mul.Tensor,backward,2,2,2,1,5835,296,8 +6821,sub_77,call_function,sub.Tensor,backward,2,2,2,1,5836,295,10 +6822,mul_714,call_function,mul.Tensor,backward,2,2,2,1,5837,294,8 +6827,add_319,call_function,add.Tensor,unknown,,2,2,1,5839,292,10 +6833,einsum_default_556,call_function,einsum.default,backward,2,2,2,1,5842,286,5 +6852,mul_716,call_function,mul.Tensor,backward,2,2,2,1,5856,252,8 +6857,mul_717,call_function,mul.Tensor,backward,2,2,2,1,5856,251,8 +6870,einsum_default_558,call_function,einsum.default,backward,2,2,2,1,5854,242,5 +6877,einsum_default_560,call_function,einsum.default,backward,2,2,2,1,5863,242,5 +6878,add_320,call_function,add.Tensor,unknown,,2,2,1,5870,241,10 +6885,einsum_default_562,call_function,einsum.default,backward,2,2,2,1,5863,241,5 +6886,add_321,call_function,add.Tensor,unknown,,2,2,1,5886,240,10 +6895,mul_719,call_function,mul.Tensor,backward,2,2,2,1,237,238,8 +6894,mul_718,call_function,mul.Tensor,backward,2,2,2,1,5890,232,8 +6898,mul_720,call_function,mul.Tensor,backward,2,2,2,1,5895,230,8 +6901,mul_721,call_function,mul.Tensor,backward,2,2,2,1,5898,228,8 +6902,sub_78,call_function,sub.Tensor,backward,2,2,2,1,5899,227,10 +6903,mul_722,call_function,mul.Tensor,backward,2,2,2,1,5900,226,8 +6908,add_322,call_function,add.Tensor,unknown,,2,2,1,5902,224,10 +6914,einsum_default_564,call_function,einsum.default,backward,1,2,2,1,5905,218,5 +6920,mul_725,call_function,mul.Tensor,backward,1,2,2,1,5907,209,8 +6939,mul_728,call_function,mul.Tensor,backward,1,2,2,1,215,208,8 +6937,mul_727,call_function,mul.Tensor,backward,1,2,2,1,5917,207,8 +6941,mul_729,call_function,mul.Tensor,backward,1,2,2,1,5921,206,8 +6919,mul_724,call_function,mul.Tensor,backward,1,2,2,1,5907,205,8 
+6924,einsum_default_566,call_function,einsum.default,backward,1,2,2,1,5910,199,5 +6946,einsum_default_568,call_function,einsum.default,backward,1,2,2,1,5925,199,5 +6947,add_325,call_function,add.Tensor,unknown,,2,2,1,5930,198,10 +6956,mul_731,call_function,mul.Tensor,backward,1,2,2,1,194,196,8 +6955,mul_730,call_function,mul.Tensor,backward,1,2,2,1,5934,190,8 +6959,mul_732,call_function,mul.Tensor,backward,1,2,2,1,5939,188,8 +6962,mul_733,call_function,mul.Tensor,backward,1,2,2,1,5942,186,8 +6963,sub_80,call_function,sub.Tensor,backward,1,2,2,1,5943,185,10 +6964,mul_734,call_function,mul.Tensor,backward,1,2,2,1,5944,184,8 +6969,add_326,call_function,add.Tensor,unknown,,2,2,1,5946,182,10 +6975,einsum_default_570,call_function,einsum.default,backward,1,2,2,1,5949,176,5 +6994,mul_736,call_function,mul.Tensor,backward,1,2,2,1,5963,142,8 +6999,mul_737,call_function,mul.Tensor,backward,1,2,2,1,5963,141,8 +7012,einsum_default_572,call_function,einsum.default,backward,1,2,2,1,5961,132,5 +7019,einsum_default_574,call_function,einsum.default,backward,1,2,2,1,5970,132,5 +7020,add_327,call_function,add.Tensor,unknown,,2,2,1,5977,131,10 +7027,einsum_default_576,call_function,einsum.default,backward,1,2,2,1,5970,131,5 +7028,add_328,call_function,add.Tensor,unknown,,2,2,1,5993,130,10 +7037,mul_739,call_function,mul.Tensor,backward,1,2,2,1,127,128,8 +7036,mul_738,call_function,mul.Tensor,backward,1,2,2,1,5997,122,8 +7040,mul_740,call_function,mul.Tensor,backward,1,2,2,1,6002,120,8 +7043,mul_741,call_function,mul.Tensor,backward,1,2,2,1,6005,118,8 +7044,sub_81,call_function,sub.Tensor,backward,1,2,2,1,6006,117,10 +7045,mul_742,call_function,mul.Tensor,backward,1,2,2,1,6007,116,8 +7050,add_329,call_function,add.Tensor,unknown,,2,2,1,6009,114,10 +7056,einsum_default_578,call_function,einsum.default,backward,0,2,2,1,6012,108,5 +7062,mul_745,call_function,mul.Tensor,backward,0,2,2,1,6014,99,8 +7081,mul_748,call_function,mul.Tensor,backward,0,2,2,1,105,98,8 
+7079,mul_747,call_function,mul.Tensor,backward,0,2,2,1,6024,97,8 +7083,mul_749,call_function,mul.Tensor,backward,0,2,2,1,6028,96,8 +7061,mul_744,call_function,mul.Tensor,backward,0,2,2,1,6014,95,8 +7066,einsum_default_580,call_function,einsum.default,backward,0,2,2,1,6017,89,5 +7088,einsum_default_582,call_function,einsum.default,backward,0,2,2,1,6032,89,5 +7089,add_332,call_function,add.Tensor,unknown,,2,2,1,6037,88,10 +7098,mul_751,call_function,mul.Tensor,backward,0,2,2,1,84,86,8 +7097,mul_750,call_function,mul.Tensor,backward,0,2,2,1,6041,80,8 +7101,mul_752,call_function,mul.Tensor,backward,0,2,2,1,6046,78,8 +7104,mul_753,call_function,mul.Tensor,backward,0,2,2,1,6049,76,8 +7105,sub_83,call_function,sub.Tensor,backward,0,2,2,1,6050,75,10 +7106,mul_754,call_function,mul.Tensor,backward,0,2,2,1,6051,74,8 +7111,add_333,call_function,add.Tensor,unknown,,2,2,1,6053,72,10 +7117,einsum_default_584,call_function,einsum.default,backward,0,2,2,1,6056,66,5 +7136,mul_756,call_function,mul.Tensor,backward,0,2,2,1,6070,32,8 +7141,mul_757,call_function,mul.Tensor,backward,0,2,2,1,6070,31,8 +7154,einsum_default_586,call_function,einsum.default,backward,0,2,2,1,6068,22,5 +7161,einsum_default_588,call_function,einsum.default,backward,0,2,2,1,6077,22,5 +7162,add_334,call_function,add.Tensor,unknown,,2,2,1,6084,21,10 +7169,einsum_default_590,call_function,einsum.default,backward,0,2,2,1,6077,21,5 +7170,add_335,call_function,add.Tensor,unknown,,2,2,1,6100,20,10 +7179,mul_759,call_function,mul.Tensor,backward,0,2,2,1,15,18,8 +7178,mul_758,call_function,mul.Tensor,backward,0,2,2,1,6104,12,8 +3183,mul_196,call_function,mul.Tensor,forward,,2,2,1,3096,10,8 +7182,mul_760,call_function,mul.Tensor,backward,0,2,2,1,6109,10,8 +3185,mul_197,call_function,mul.Tensor,forward,,2,2,1,3100,9,8 +7185,mul_761,call_function,mul.Tensor,backward,0,2,2,1,6112,8,8 +7186,sub_84,call_function,sub.Tensor,backward,0,2,2,1,6113,7,10 +7187,mul_762,call_function,mul.Tensor,backward,0,2,2,1,6114,6,8 
+3194,einsum_default_197,call_function,einsum.default,backward,,2,2,1,3105,4,5 +3213,mul_203,call_function,mul.Tensor,backward,,2,2,1,3108,4,8 +3273,mul_215,call_function,mul.Tensor,backward,27,2,2,1,3154,4,8 +3354,mul_223,call_function,mul.Tensor,backward,27,2,2,1,3217,4,8 +3415,mul_235,call_function,mul.Tensor,backward,26,2,2,1,3261,4,8 +3496,mul_243,call_function,mul.Tensor,backward,26,2,2,1,3324,4,8 +3557,mul_255,call_function,mul.Tensor,backward,25,2,2,1,3368,4,8 +3638,mul_263,call_function,mul.Tensor,backward,25,2,2,1,3431,4,8 +3699,mul_275,call_function,mul.Tensor,backward,24,2,2,1,3475,4,8 +3780,mul_283,call_function,mul.Tensor,backward,24,2,2,1,3538,4,8 +3841,mul_295,call_function,mul.Tensor,backward,23,2,2,1,3582,4,8 +3922,mul_303,call_function,mul.Tensor,backward,23,2,2,1,3645,4,8 +3983,mul_315,call_function,mul.Tensor,backward,22,2,2,1,3689,4,8 +4064,mul_323,call_function,mul.Tensor,backward,22,2,2,1,3752,4,8 +4125,mul_335,call_function,mul.Tensor,backward,21,2,2,1,3796,4,8 +4206,mul_343,call_function,mul.Tensor,backward,21,2,2,1,3859,4,8 +4267,mul_355,call_function,mul.Tensor,backward,20,2,2,1,3903,4,8 +4348,mul_363,call_function,mul.Tensor,backward,20,2,2,1,3966,4,8 +4409,mul_375,call_function,mul.Tensor,backward,19,2,2,1,4010,4,8 +4490,mul_383,call_function,mul.Tensor,backward,19,2,2,1,4073,4,8 +4551,mul_395,call_function,mul.Tensor,backward,18,2,2,1,4117,4,8 +4632,mul_403,call_function,mul.Tensor,backward,18,2,2,1,4180,4,8 +4693,mul_415,call_function,mul.Tensor,backward,17,2,2,1,4224,4,8 +4774,mul_423,call_function,mul.Tensor,backward,17,2,2,1,4287,4,8 +4835,mul_435,call_function,mul.Tensor,backward,16,2,2,1,4331,4,8 +4916,mul_443,call_function,mul.Tensor,backward,16,2,2,1,4394,4,8 +4977,mul_455,call_function,mul.Tensor,backward,15,2,2,1,4438,4,8 +5058,mul_463,call_function,mul.Tensor,backward,15,2,2,1,4501,4,8 +5119,mul_475,call_function,mul.Tensor,backward,14,2,2,1,4545,4,8 +5200,mul_483,call_function,mul.Tensor,backward,14,2,2,1,4608,4,8 
+5261,mul_495,call_function,mul.Tensor,backward,13,2,2,1,4652,4,8 +5342,mul_503,call_function,mul.Tensor,backward,13,2,2,1,4715,4,8 +5403,mul_515,call_function,mul.Tensor,backward,12,2,2,1,4759,4,8 +5484,mul_523,call_function,mul.Tensor,backward,12,2,2,1,4822,4,8 +5545,mul_535,call_function,mul.Tensor,backward,11,2,2,1,4866,4,8 +5626,mul_543,call_function,mul.Tensor,backward,11,2,2,1,4929,4,8 +5687,mul_555,call_function,mul.Tensor,backward,10,2,2,1,4973,4,8 +5768,mul_563,call_function,mul.Tensor,backward,10,2,2,1,5036,4,8 +5829,mul_575,call_function,mul.Tensor,backward,9,2,2,1,5080,4,8 +5910,mul_583,call_function,mul.Tensor,backward,9,2,2,1,5143,4,8 +5971,mul_595,call_function,mul.Tensor,backward,8,2,2,1,5187,4,8 +6052,mul_603,call_function,mul.Tensor,backward,8,2,2,1,5250,4,8 +6113,mul_615,call_function,mul.Tensor,backward,7,2,2,1,5294,4,8 +6194,mul_623,call_function,mul.Tensor,backward,7,2,2,1,5357,4,8 +6255,mul_635,call_function,mul.Tensor,backward,6,2,2,1,5401,4,8 +6336,mul_643,call_function,mul.Tensor,backward,6,2,2,1,5464,4,8 +6397,mul_655,call_function,mul.Tensor,backward,5,2,2,1,5508,4,8 +6478,mul_663,call_function,mul.Tensor,backward,5,2,2,1,5571,4,8 +6539,mul_675,call_function,mul.Tensor,backward,4,2,2,1,5615,4,8 +6620,mul_683,call_function,mul.Tensor,backward,4,2,2,1,5678,4,8 +6681,mul_695,call_function,mul.Tensor,backward,3,2,2,1,5722,4,8 +6762,mul_703,call_function,mul.Tensor,backward,3,2,2,1,5785,4,8 +6823,mul_715,call_function,mul.Tensor,backward,2,2,2,1,5829,4,8 +6904,mul_723,call_function,mul.Tensor,backward,2,2,2,1,5892,4,8 +6965,mul_735,call_function,mul.Tensor,backward,1,2,2,1,5936,4,8 +7046,mul_743,call_function,mul.Tensor,backward,1,2,2,1,5999,4,8 +7107,mul_755,call_function,mul.Tensor,backward,0,2,2,1,6043,4,8 +7188,mul_763,call_function,mul.Tensor,backward,0,2,2,1,6106,4,8 +7192,add_336,call_function,add.Tensor,unknown,,2,2,1,6116,4,10 +3220,einsum_default_199,call_function,einsum.default,backward,27,2,2,1,3122,3,5 
+3230,einsum_default_201,call_function,einsum.default,backward,27,2,2,1,3127,3,5 +3252,einsum_default_203,call_function,einsum.default,backward,27,2,2,1,3142,3,5 +3281,einsum_default_205,call_function,einsum.default,backward,27,2,2,1,3166,3,5 +3318,einsum_default_207,call_function,einsum.default,backward,27,2,2,1,3178,3,5 +3325,einsum_default_209,call_function,einsum.default,backward,27,2,2,1,3187,3,5 +3333,einsum_default_211,call_function,einsum.default,backward,27,2,2,1,3187,3,5 +3362,einsum_default_213,call_function,einsum.default,backward,26,2,2,1,3229,3,5 +3372,einsum_default_215,call_function,einsum.default,backward,26,2,2,1,3234,3,5 +3394,einsum_default_217,call_function,einsum.default,backward,26,2,2,1,3249,3,5 +3423,einsum_default_219,call_function,einsum.default,backward,26,2,2,1,3273,3,5 +3460,einsum_default_221,call_function,einsum.default,backward,26,2,2,1,3285,3,5 +3467,einsum_default_223,call_function,einsum.default,backward,26,2,2,1,3294,3,5 +3475,einsum_default_225,call_function,einsum.default,backward,26,2,2,1,3294,3,5 +3504,einsum_default_227,call_function,einsum.default,backward,25,2,2,1,3336,3,5 +3514,einsum_default_229,call_function,einsum.default,backward,25,2,2,1,3341,3,5 +3536,einsum_default_231,call_function,einsum.default,backward,25,2,2,1,3356,3,5 +3565,einsum_default_233,call_function,einsum.default,backward,25,2,2,1,3380,3,5 +3602,einsum_default_235,call_function,einsum.default,backward,25,2,2,1,3392,3,5 +3609,einsum_default_237,call_function,einsum.default,backward,25,2,2,1,3401,3,5 +3617,einsum_default_239,call_function,einsum.default,backward,25,2,2,1,3401,3,5 +3646,einsum_default_241,call_function,einsum.default,backward,24,2,2,1,3443,3,5 +3656,einsum_default_243,call_function,einsum.default,backward,24,2,2,1,3448,3,5 +3678,einsum_default_245,call_function,einsum.default,backward,24,2,2,1,3463,3,5 +3707,einsum_default_247,call_function,einsum.default,backward,24,2,2,1,3487,3,5 
+3744,einsum_default_249,call_function,einsum.default,backward,24,2,2,1,3499,3,5 +3751,einsum_default_251,call_function,einsum.default,backward,24,2,2,1,3508,3,5 +3759,einsum_default_253,call_function,einsum.default,backward,24,2,2,1,3508,3,5 +3788,einsum_default_255,call_function,einsum.default,backward,23,2,2,1,3550,3,5 +3798,einsum_default_257,call_function,einsum.default,backward,23,2,2,1,3555,3,5 +3820,einsum_default_259,call_function,einsum.default,backward,23,2,2,1,3570,3,5 +3849,einsum_default_261,call_function,einsum.default,backward,23,2,2,1,3594,3,5 +3886,einsum_default_263,call_function,einsum.default,backward,23,2,2,1,3606,3,5 +3893,einsum_default_265,call_function,einsum.default,backward,23,2,2,1,3615,3,5 +3901,einsum_default_267,call_function,einsum.default,backward,23,2,2,1,3615,3,5 +3930,einsum_default_269,call_function,einsum.default,backward,22,2,2,1,3657,3,5 +3940,einsum_default_271,call_function,einsum.default,backward,22,2,2,1,3662,3,5 +3962,einsum_default_273,call_function,einsum.default,backward,22,2,2,1,3677,3,5 +3991,einsum_default_275,call_function,einsum.default,backward,22,2,2,1,3701,3,5 +4028,einsum_default_277,call_function,einsum.default,backward,22,2,2,1,3713,3,5 +4035,einsum_default_279,call_function,einsum.default,backward,22,2,2,1,3722,3,5 +4043,einsum_default_281,call_function,einsum.default,backward,22,2,2,1,3722,3,5 +4072,einsum_default_283,call_function,einsum.default,backward,21,2,2,1,3764,3,5 +4082,einsum_default_285,call_function,einsum.default,backward,21,2,2,1,3769,3,5 +4104,einsum_default_287,call_function,einsum.default,backward,21,2,2,1,3784,3,5 +4133,einsum_default_289,call_function,einsum.default,backward,21,2,2,1,3808,3,5 +4170,einsum_default_291,call_function,einsum.default,backward,21,2,2,1,3820,3,5 +4177,einsum_default_293,call_function,einsum.default,backward,21,2,2,1,3829,3,5 +4185,einsum_default_295,call_function,einsum.default,backward,21,2,2,1,3829,3,5 
+4214,einsum_default_297,call_function,einsum.default,backward,20,2,2,1,3871,3,5 +4224,einsum_default_299,call_function,einsum.default,backward,20,2,2,1,3876,3,5 +4246,einsum_default_301,call_function,einsum.default,backward,20,2,2,1,3891,3,5 +4275,einsum_default_303,call_function,einsum.default,backward,20,2,2,1,3915,3,5 +4312,einsum_default_305,call_function,einsum.default,backward,20,2,2,1,3927,3,5 +4319,einsum_default_307,call_function,einsum.default,backward,20,2,2,1,3936,3,5 +4327,einsum_default_309,call_function,einsum.default,backward,20,2,2,1,3936,3,5 +4356,einsum_default_311,call_function,einsum.default,backward,19,2,2,1,3978,3,5 +4366,einsum_default_313,call_function,einsum.default,backward,19,2,2,1,3983,3,5 +4388,einsum_default_315,call_function,einsum.default,backward,19,2,2,1,3998,3,5 +4417,einsum_default_317,call_function,einsum.default,backward,19,2,2,1,4022,3,5 +4454,einsum_default_319,call_function,einsum.default,backward,19,2,2,1,4034,3,5 +4461,einsum_default_321,call_function,einsum.default,backward,19,2,2,1,4043,3,5 +4469,einsum_default_323,call_function,einsum.default,backward,19,2,2,1,4043,3,5 +4498,einsum_default_325,call_function,einsum.default,backward,18,2,2,1,4085,3,5 +4508,einsum_default_327,call_function,einsum.default,backward,18,2,2,1,4090,3,5 +4530,einsum_default_329,call_function,einsum.default,backward,18,2,2,1,4105,3,5 +4559,einsum_default_331,call_function,einsum.default,backward,18,2,2,1,4129,3,5 +4596,einsum_default_333,call_function,einsum.default,backward,18,2,2,1,4141,3,5 +4603,einsum_default_335,call_function,einsum.default,backward,18,2,2,1,4150,3,5 +4611,einsum_default_337,call_function,einsum.default,backward,18,2,2,1,4150,3,5 +4640,einsum_default_339,call_function,einsum.default,backward,17,2,2,1,4192,3,5 +4650,einsum_default_341,call_function,einsum.default,backward,17,2,2,1,4197,3,5 +4672,einsum_default_343,call_function,einsum.default,backward,17,2,2,1,4212,3,5 
+4701,einsum_default_345,call_function,einsum.default,backward,17,2,2,1,4236,3,5 +4738,einsum_default_347,call_function,einsum.default,backward,17,2,2,1,4248,3,5 +4745,einsum_default_349,call_function,einsum.default,backward,17,2,2,1,4257,3,5 +4753,einsum_default_351,call_function,einsum.default,backward,17,2,2,1,4257,3,5 +4782,einsum_default_353,call_function,einsum.default,backward,16,2,2,1,4299,3,5 +4792,einsum_default_355,call_function,einsum.default,backward,16,2,2,1,4304,3,5 +4814,einsum_default_357,call_function,einsum.default,backward,16,2,2,1,4319,3,5 +4843,einsum_default_359,call_function,einsum.default,backward,16,2,2,1,4343,3,5 +4880,einsum_default_361,call_function,einsum.default,backward,16,2,2,1,4355,3,5 +4887,einsum_default_363,call_function,einsum.default,backward,16,2,2,1,4364,3,5 +4895,einsum_default_365,call_function,einsum.default,backward,16,2,2,1,4364,3,5 +4924,einsum_default_367,call_function,einsum.default,backward,15,2,2,1,4406,3,5 +4934,einsum_default_369,call_function,einsum.default,backward,15,2,2,1,4411,3,5 +4956,einsum_default_371,call_function,einsum.default,backward,15,2,2,1,4426,3,5 +4985,einsum_default_373,call_function,einsum.default,backward,15,2,2,1,4450,3,5 +5022,einsum_default_375,call_function,einsum.default,backward,15,2,2,1,4462,3,5 +5029,einsum_default_377,call_function,einsum.default,backward,15,2,2,1,4471,3,5 +5037,einsum_default_379,call_function,einsum.default,backward,15,2,2,1,4471,3,5 +5066,einsum_default_381,call_function,einsum.default,backward,14,2,2,1,4513,3,5 +5076,einsum_default_383,call_function,einsum.default,backward,14,2,2,1,4518,3,5 +5098,einsum_default_385,call_function,einsum.default,backward,14,2,2,1,4533,3,5 +5127,einsum_default_387,call_function,einsum.default,backward,14,2,2,1,4557,3,5 +5164,einsum_default_389,call_function,einsum.default,backward,14,2,2,1,4569,3,5 +5171,einsum_default_391,call_function,einsum.default,backward,14,2,2,1,4578,3,5 
+5179,einsum_default_393,call_function,einsum.default,backward,14,2,2,1,4578,3,5 +5208,einsum_default_395,call_function,einsum.default,backward,13,2,2,1,4620,3,5 +5218,einsum_default_397,call_function,einsum.default,backward,13,2,2,1,4625,3,5 +5240,einsum_default_399,call_function,einsum.default,backward,13,2,2,1,4640,3,5 +5269,einsum_default_401,call_function,einsum.default,backward,13,2,2,1,4664,3,5 +5306,einsum_default_403,call_function,einsum.default,backward,13,2,2,1,4676,3,5 +5313,einsum_default_405,call_function,einsum.default,backward,13,2,2,1,4685,3,5 +5321,einsum_default_407,call_function,einsum.default,backward,13,2,2,1,4685,3,5 +5350,einsum_default_409,call_function,einsum.default,backward,12,2,2,1,4727,3,5 +5360,einsum_default_411,call_function,einsum.default,backward,12,2,2,1,4732,3,5 +5382,einsum_default_413,call_function,einsum.default,backward,12,2,2,1,4747,3,5 +5411,einsum_default_415,call_function,einsum.default,backward,12,2,2,1,4771,3,5 +5448,einsum_default_417,call_function,einsum.default,backward,12,2,2,1,4783,3,5 +5455,einsum_default_419,call_function,einsum.default,backward,12,2,2,1,4792,3,5 +5463,einsum_default_421,call_function,einsum.default,backward,12,2,2,1,4792,3,5 +5492,einsum_default_423,call_function,einsum.default,backward,11,2,2,1,4834,3,5 +5502,einsum_default_425,call_function,einsum.default,backward,11,2,2,1,4839,3,5 +5524,einsum_default_427,call_function,einsum.default,backward,11,2,2,1,4854,3,5 +5553,einsum_default_429,call_function,einsum.default,backward,11,2,2,1,4878,3,5 +5590,einsum_default_431,call_function,einsum.default,backward,11,2,2,1,4890,3,5 +5597,einsum_default_433,call_function,einsum.default,backward,11,2,2,1,4899,3,5 +5605,einsum_default_435,call_function,einsum.default,backward,11,2,2,1,4899,3,5 +5634,einsum_default_437,call_function,einsum.default,backward,10,2,2,1,4941,3,5 +5644,einsum_default_439,call_function,einsum.default,backward,10,2,2,1,4946,3,5 
+5666,einsum_default_441,call_function,einsum.default,backward,10,2,2,1,4961,3,5 +5695,einsum_default_443,call_function,einsum.default,backward,10,2,2,1,4985,3,5 +5732,einsum_default_445,call_function,einsum.default,backward,10,2,2,1,4997,3,5 +5739,einsum_default_447,call_function,einsum.default,backward,10,2,2,1,5006,3,5 +5747,einsum_default_449,call_function,einsum.default,backward,10,2,2,1,5006,3,5 +5776,einsum_default_451,call_function,einsum.default,backward,9,2,2,1,5048,3,5 +5786,einsum_default_453,call_function,einsum.default,backward,9,2,2,1,5053,3,5 +5808,einsum_default_455,call_function,einsum.default,backward,9,2,2,1,5068,3,5 +5837,einsum_default_457,call_function,einsum.default,backward,9,2,2,1,5092,3,5 +5874,einsum_default_459,call_function,einsum.default,backward,9,2,2,1,5104,3,5 +5881,einsum_default_461,call_function,einsum.default,backward,9,2,2,1,5113,3,5 +5889,einsum_default_463,call_function,einsum.default,backward,9,2,2,1,5113,3,5 +5918,einsum_default_465,call_function,einsum.default,backward,8,2,2,1,5155,3,5 +5928,einsum_default_467,call_function,einsum.default,backward,8,2,2,1,5160,3,5 +5950,einsum_default_469,call_function,einsum.default,backward,8,2,2,1,5175,3,5 +5979,einsum_default_471,call_function,einsum.default,backward,8,2,2,1,5199,3,5 +6016,einsum_default_473,call_function,einsum.default,backward,8,2,2,1,5211,3,5 +6023,einsum_default_475,call_function,einsum.default,backward,8,2,2,1,5220,3,5 +6031,einsum_default_477,call_function,einsum.default,backward,8,2,2,1,5220,3,5 +6060,einsum_default_479,call_function,einsum.default,backward,7,2,2,1,5262,3,5 +6070,einsum_default_481,call_function,einsum.default,backward,7,2,2,1,5267,3,5 +6092,einsum_default_483,call_function,einsum.default,backward,7,2,2,1,5282,3,5 +6121,einsum_default_485,call_function,einsum.default,backward,7,2,2,1,5306,3,5 +6158,einsum_default_487,call_function,einsum.default,backward,7,2,2,1,5318,3,5 
+6165,einsum_default_489,call_function,einsum.default,backward,7,2,2,1,5327,3,5 +6173,einsum_default_491,call_function,einsum.default,backward,7,2,2,1,5327,3,5 +6202,einsum_default_493,call_function,einsum.default,backward,6,2,2,1,5369,3,5 +6212,einsum_default_495,call_function,einsum.default,backward,6,2,2,1,5374,3,5 +6234,einsum_default_497,call_function,einsum.default,backward,6,2,2,1,5389,3,5 +6263,einsum_default_499,call_function,einsum.default,backward,6,2,2,1,5413,3,5 +6300,einsum_default_501,call_function,einsum.default,backward,6,2,2,1,5425,3,5 +6307,einsum_default_503,call_function,einsum.default,backward,6,2,2,1,5434,3,5 +6315,einsum_default_505,call_function,einsum.default,backward,6,2,2,1,5434,3,5 +6344,einsum_default_507,call_function,einsum.default,backward,5,2,2,1,5476,3,5 +6354,einsum_default_509,call_function,einsum.default,backward,5,2,2,1,5481,3,5 +6376,einsum_default_511,call_function,einsum.default,backward,5,2,2,1,5496,3,5 +6405,einsum_default_513,call_function,einsum.default,backward,5,2,2,1,5520,3,5 +6442,einsum_default_515,call_function,einsum.default,backward,5,2,2,1,5532,3,5 +6449,einsum_default_517,call_function,einsum.default,backward,5,2,2,1,5541,3,5 +6457,einsum_default_519,call_function,einsum.default,backward,5,2,2,1,5541,3,5 +6486,einsum_default_521,call_function,einsum.default,backward,4,2,2,1,5583,3,5 +6496,einsum_default_523,call_function,einsum.default,backward,4,2,2,1,5588,3,5 +6518,einsum_default_525,call_function,einsum.default,backward,4,2,2,1,5603,3,5 +6547,einsum_default_527,call_function,einsum.default,backward,4,2,2,1,5627,3,5 +6584,einsum_default_529,call_function,einsum.default,backward,4,2,2,1,5639,3,5 +6591,einsum_default_531,call_function,einsum.default,backward,4,2,2,1,5648,3,5 +6599,einsum_default_533,call_function,einsum.default,backward,4,2,2,1,5648,3,5 +6628,einsum_default_535,call_function,einsum.default,backward,3,2,2,1,5690,3,5 +6638,einsum_default_537,call_function,einsum.default,backward,3,2,2,1,5695,3,5 
+6660,einsum_default_539,call_function,einsum.default,backward,3,2,2,1,5710,3,5 +6689,einsum_default_541,call_function,einsum.default,backward,3,2,2,1,5734,3,5 +6726,einsum_default_543,call_function,einsum.default,backward,3,2,2,1,5746,3,5 +6733,einsum_default_545,call_function,einsum.default,backward,3,2,2,1,5755,3,5 +6741,einsum_default_547,call_function,einsum.default,backward,3,2,2,1,5755,3,5 +6770,einsum_default_549,call_function,einsum.default,backward,2,2,2,1,5797,3,5 +6780,einsum_default_551,call_function,einsum.default,backward,2,2,2,1,5802,3,5 +6802,einsum_default_553,call_function,einsum.default,backward,2,2,2,1,5817,3,5 +6831,einsum_default_555,call_function,einsum.default,backward,2,2,2,1,5841,3,5 +6868,einsum_default_557,call_function,einsum.default,backward,2,2,2,1,5853,3,5 +6875,einsum_default_559,call_function,einsum.default,backward,2,2,2,1,5862,3,5 +6883,einsum_default_561,call_function,einsum.default,backward,2,2,2,1,5862,3,5 +6912,einsum_default_563,call_function,einsum.default,backward,1,2,2,1,5904,3,5 +6922,einsum_default_565,call_function,einsum.default,backward,1,2,2,1,5909,3,5 +6944,einsum_default_567,call_function,einsum.default,backward,1,2,2,1,5924,3,5 +6973,einsum_default_569,call_function,einsum.default,backward,1,2,2,1,5948,3,5 +7010,einsum_default_571,call_function,einsum.default,backward,1,2,2,1,5960,3,5 +7017,einsum_default_573,call_function,einsum.default,backward,1,2,2,1,5969,3,5 +7025,einsum_default_575,call_function,einsum.default,backward,1,2,2,1,5969,3,5 +7054,einsum_default_577,call_function,einsum.default,backward,0,2,2,1,6011,3,5 +7064,einsum_default_579,call_function,einsum.default,backward,0,2,2,1,6016,3,5 +7086,einsum_default_581,call_function,einsum.default,backward,0,2,2,1,6031,3,5 +7115,einsum_default_583,call_function,einsum.default,backward,0,2,2,1,6055,3,5 +7152,einsum_default_585,call_function,einsum.default,backward,0,2,2,1,6067,3,5 +7159,einsum_default_587,call_function,einsum.default,backward,0,2,2,1,6076,3,5 
+7167,einsum_default_589,call_function,einsum.default,backward,0,2,2,1,6076,3,5 +7195,embedding_dense_backward,call_function,embedding_dense_backward.default,backward,,2,2,1,6117,3,5 +3191,einsum_default_196,call_function,einsum.default,forward,,2,2,1,3106,1,5 +7197,add_337,call_function,add.Tensor,unknown,,2,2,1,6126,1,9 diff --git a/profile_results/real_llama3_by_mesh_dim.svg b/profile_results/real_llama3_by_mesh_dim.svg new file mode 100644 index 00000000..6eb41508 --- /dev/null +++ b/profile_results/real_llama3_by_mesh_dim.svg @@ -0,0 +1,167 @@ + + +Real Llama3 optimizer profile vs mesh dimension +Y axes are log scale. Missing series points timed out or were not run. + +model_key=1B + +model_key=3B + +model_key=405B + +model_key=70B + +model_key=8B +strategy enum (s) + + + +9.9 +0.54 +1 +2 + + + + + + + + + + + +cost estimation (s) + + + +5 +0.54 +1 +2 + + + + + + + + + + + +ILP construction (s) + + + +14 +0.26 +1 +2 + + + + + + + + + + + +objective build (s) + + + +3.3 +0.053 +1 +2 + + + + + + + + + + + +solve (s) + + + +86 +0.49 +1 +2 + + + + + + + + + + + +pipeline total (s) + + + +124 +3 +1 +2 + + + + + + + + + + + +unique ILP vars + + + +488.5K +13.0K +1 +2 + + + + + + + + + + + +constraints + + + +177.2K +7.0K +1 +2 + + + + + + + + + + + + \ No newline at end of file diff --git a/profile_results/real_llama3_by_model_size.svg b/profile_results/real_llama3_by_model_size.svg new file mode 100644 index 00000000..11fabae2 --- /dev/null +++ b/profile_results/real_llama3_by_model_size.svg @@ -0,0 +1,177 @@ + + +Real Llama3 optimizer profile vs model size +Y axes are log scale. Missing series points timed out or were not run. 
+ +mesh_ndim=1 + +mesh_ndim=2 +strategy enum (s) + + + +9.9 +0.54 +1.2 +3.2 +8 +71 +406 + + + + + + + + + + +cost estimation (s) + + + +5 +0.54 +1.2 +3.2 +8 +71 +406 + + + + + + + + + + +ILP construction (s) + + + +14 +0.26 +1.2 +3.2 +8 +71 +406 + + + + + + + + + + +objective build (s) + + + +3.3 +0.053 +1.2 +3.2 +8 +71 +406 + + + + + + + + + + +solve (s) + + + +86 +0.49 +1.2 +3.2 +8 +71 +406 + + + + + + + + + + +pipeline total (s) + + + +124 +3 +1.2 +3.2 +8 +71 +406 + + + + + + + + + + +unique ILP vars + + + +488.5K +13.0K +1.2 +3.2 +8 +71 +406 + + + + + + + + + + +constraints + + + +177.2K +7.0K +1.2 +3.2 +8 +71 +406 + + + + + + + + + + + \ No newline at end of file diff --git a/profile_results/real_llama3_dag_analysis.py b/profile_results/real_llama3_dag_analysis.py new file mode 100644 index 00000000..03b445a3 --- /dev/null +++ b/profile_results/real_llama3_dag_analysis.py @@ -0,0 +1,255 @@ +import csv +import json +import logging +import re +import sys +import time +from collections import Counter, defaultdict +from pathlib import Path + +import networkx as nx +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +sys.path.insert(0, "/home/wangkj/workspace/torchtitan") + +from torchtitan.models.llama3 import llama3_configs # noqa: E402 + +from autoparallel.api import AutoParallel +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + + +WORLD_SIZE = 64 +SEQ_LEN = 256 +GLOBAL_BATCH = 64 + + +def init_dist(): + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group( + "fake", store=FakeStore(), rank=0, world_size=WORLD_SIZE + ) + + +def target_name(node): + target = node.target + if hasattr(target, "__name__"): + return target.__name__ + return str(target) + + +def 
layer_id(node): + stacks = [] + for key in ("nn_module_stack", "fwd_nn_module_stack"): + value = node.meta.get(key) + if value: + stacks.append(str(value)) + text = " ".join(stacks) + match = re.search(r"layers[._']+([0-9]+)", text) + return int(match.group(1)) if match else "" + + +def phase(node): + if "fwd_nn_module_stack" in node.meta: + return "backward" + if "nn_module_stack" in node.meta: + return "forward" + if node.op == "placeholder" and str(node.name).startswith("tangents"): + return "backward" + return "unknown" + + +def bitset_counts(nodes, edges): + idx = {node: i for i, node in enumerate(nodes)} + children = [[] for _ in nodes] + parents = [[] for _ in nodes] + for src, dst in edges: + children[idx[src]].append(idx[dst]) + parents[idx[dst]].append(idx[src]) + + descendants = [0] * len(nodes) + for i in range(len(nodes) - 1, -1, -1): + bits = 0 + for child in children[i]: + bits |= 1 << child + bits |= descendants[child] + descendants[i] = bits + + ancestors = [0] * len(nodes) + for i in range(len(nodes)): + bits = 0 + for parent in parents[i]: + bits |= 1 << parent + bits |= ancestors[parent] + ancestors[i] = bits + + return ( + [bits.bit_count() for bits in ancestors], + [bits.bit_count() for bits in descendants], + ) + + +def treewidth_upper_bounds(edges): + graph = nx.Graph() + graph.add_edges_from(edges) + width_min_fill, _ = nx.approximation.treewidth_min_fill_in(graph) + width_min_degree, _ = nx.approximation.treewidth_min_degree(graph) + + moral = graph.copy() + parents_by_child = defaultdict(list) + for src, dst in edges: + parents_by_child[dst].append(src) + for parents in parents_by_child.values(): + for i, left in enumerate(parents): + for right in parents[i + 1 :]: + moral.add_edge(left, right) + moral_width_min_fill, _ = nx.approximation.treewidth_min_fill_in(moral) + moral_width_min_degree, _ = nx.approximation.treewidth_min_degree(moral) + return { + "undirected_min_fill": width_min_fill, + "undirected_min_degree": width_min_degree, + 
"moralized_min_fill": moral_width_min_fill, + "moralized_min_degree": moral_width_min_degree, + "undirected_edges": graph.number_of_edges(), + "moralized_edges": moral.number_of_edges(), + } + + +def run_analysis(out_dir): + init_dist() + mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", (64,), mesh_dim_names=("dp",) + ) + set_nccl_topo_config(detect_nccl_topo_config(mesh)) + + config = llama3_configs["3B"](attn_backend="sdpa") + config.rope.max_seq_len = SEQ_LEN + with torch.device("meta"): + model = config.build() + + def input_fn(): + return torch.randint(0, config.vocab_size, (GLOBAL_BATCH, SEQ_LEN), device="cuda") + + mp_policy = MixedPrecisionPolicy( + param_dtype=torch.bfloat16, reduce_dtype=torch.float32 + ) + t0 = time.perf_counter() + with AutoParallel( + model, input_fn, mesh, mp_policy, repeated_subgraphs=True + ) as autop: + autop.add_parameter_memory_constraint(low=None, high=None) + autop.add_input_constraints([(Shard(0),)]) + autop.add_output_constraints([(Shard(0),)]) + opt = autop.sharding_optimizer + + ilp_nodes = [node for node in opt.nodes if node.op != "output"] + ilp_node_set = set(ilp_nodes) + edges = [] + dep_args = {} + dep_unique = {} + for node in ilp_nodes: + inputs = [inp for inp in opt._all_input_nodes(node) if inp in ilp_node_set] + dep_args[node] = len(inputs) + dep_unique[node] = len(set(inputs)) + for inp in set(inputs): + edges.append((inp, node)) + + offspring = Counter() + for src, _dst in edges: + offspring[src] += 1 + + ancestor_counts, descendant_counts = bitset_counts(ilp_nodes, edges) + node_to_idx = {node: i for i, node in enumerate(ilp_nodes)} + treewidth = treewidth_upper_bounds(edges) + + rows = [] + for node in ilp_nodes: + idx = node_to_idx[node] + rows.append( + { + "idx": idx, + "name": node.name, + "op": node.op, + "target": target_name(node), + "phase": phase(node), + "layer": layer_id(node), + "direct_dependency_args": dep_args[node], + "direct_dependency_nodes": dep_unique[node], + 
"direct_offspring_nodes": offspring[node], + "ancestor_count": ancestor_counts[idx], + "descendant_count": descendant_counts[idx], + "strategy_count": len(opt.strats[node].strategies), + } + ) + + merge_points = [ + row for row in rows if int(row["direct_dependency_nodes"]) > 1 + ] + merge_points.sort( + key=lambda row: ( + -int(row["direct_dependency_nodes"]), + -int(row["descendant_count"]), + int(row["idx"]), + ) + ) + fanout_points = sorted( + rows, + key=lambda row: (-int(row["direct_offspring_nodes"]), int(row["idx"])), + ) + + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + node_csv = out_dir / "real_llama3_3b_dag_node_stats.csv" + with node_csv.open("w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=list(rows[0].keys())) + writer.writeheader() + writer.writerows(rows) + + merge_csv = out_dir / "real_llama3_3b_merge_points.csv" + with merge_csv.open("w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=list(rows[0].keys())) + writer.writeheader() + writer.writerows(merge_points) + + summary = { + "model": "LLaMA3 3B", + "mesh": "1D 64", + "trace_and_optimizer_build_s": time.perf_counter() - t0, + "ilp_nodes": len(ilp_nodes), + "dag_edges": len(edges), + "merge_points": len(merge_points), + "branch_points": sum(1 for row in rows if int(row["direct_offspring_nodes"]) > 1), + "max_direct_dependency_nodes": max(int(row["direct_dependency_nodes"]) for row in rows), + "max_direct_offspring_nodes": max(int(row["direct_offspring_nodes"]) for row in rows), + "max_ancestor_count": max(int(row["ancestor_count"]) for row in rows), + "max_descendant_count": max(int(row["descendant_count"]) for row in rows), + "treewidth_upper_bounds": treewidth, + "direct_dependency_histogram": dict( + sorted(Counter(int(row["direct_dependency_nodes"]) for row in rows).items()) + ), + "direct_offspring_histogram": dict( + sorted(Counter(int(row["direct_offspring_nodes"]) for row in rows).items()) + ), + "top_merge_points": merge_points[:30], + 
"top_fanout_points": fanout_points[:30], + "node_stats_csv": str(node_csv), + "merge_points_csv": str(merge_csv), + } + summary_path = out_dir / "real_llama3_3b_dag_summary.json" + summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True)) + print(json.dumps(summary, indent=2, sort_keys=True)) + + +def main(): + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s:%(name)s:%(message)s", + ) + run_analysis("profile_results") + + +if __name__ == "__main__": + main() diff --git a/profile_results/real_llama3_optimizer_presolve_3d4d.log b/profile_results/real_llama3_optimizer_presolve_3d4d.log new file mode 100644 index 00000000..923ec1f1 --- /dev/null +++ b/profile_results/real_llama3_optimizer_presolve_3d4d.log @@ -0,0 +1,7 @@ +[14:50:20] start model=1B mesh_ndim=3 skip_solve timeout=1200s +2026-05-26 14:50:29,648 INFO:autoparallel.api:Graph tracing took 6.073s +2026-05-26 14:58:18,227 INFO:autoparallel.optimize_sharding:ShardingOptimizer phase profile: phase=strategy_enumeration mesh_shape=(4, 4, 4) mesh_dim_names=('dp', 'tp', 'cp') mesh_size=64 model_params=1.24B graph_nodes=4140 strategy_options=662279 option_tuples=181062856 elapsed=459.509s +2026-05-26 15:07:42,067 INFO:autoparallel.optimize_sharding:ShardingOptimizer phase profile: phase=decision_vars mesh_shape=(4, 4, 4) mesh_dim_names=('dp', 'tp', 'cp') mesh_size=64 model_params=1.24B unique_ilp_vars=20390366 logical_decision_vars=181062856 cluster_copied_decision_vars=160672490 elapsed=462.310s +[15:10:23] done model=1B mesh_ndim=3 rc=124 +[15:10:23] start model=1B mesh_ndim=4 skip_solve timeout=1200s +2026-05-26 15:10:32,788 INFO:autoparallel.api:Graph tracing took 6.079s diff --git a/profile_results/real_llama3_optimizer_sweep.csv b/profile_results/real_llama3_optimizer_sweep.csv new file mode 100644 index 00000000..30d2e4f5 --- /dev/null +++ b/profile_results/real_llama3_optimizer_sweep.csv @@ -0,0 +1,9 @@ 
+cluster_copied_decision_vars,compute_cost_estimation_s,constraints_init,constraints_solve,cost_estimation_s,decision_var_build_s,decision_var_overhead_s,edge_cost_estimation_s,extract_s,graph_nodes,ilp_construction_s,logical_decision_vars,max_strategies_per_node,mesh_ndim,mesh_shape,mesh_size,model_key,objective,objective_s,optimizer_pipeline_s,option_tuples,parameter_b,parameter_gib,parameter_nodes,parameter_numel,solve_s,status,strategy_enumeration_s,strategy_options,tensor_nodes,total_wall_s,unique_ilp_vars,validation_s +101888,0.4790569522883743,7038,7042,0.5384756466373801,0.6978608381468803,0.09741721651516855,0.05941869434900582,0.01627982617355883,4140,0.26339267240837216,114928,10,1,64,64,1B,75411.02054353141,0.053351440001279116,3.032338660908863,114928,1.2358144,4.603767395019531,146,1235814400,0.49132931185886264,Optimal,0.6722944700159132,18503,4139,8.946083615999669,13040,0.31402136106044054 +194792,0.48607375379651785,8080,8084,0.5489266884978861,0.7471572819631547,0.1306607834994793,0.06285293470136821,0.029804171063005924,7200,0.3148333504796028,208698,10,1,64,64,3B,155857.5709074804,0.05978171294555068,4.169702837942168,208698,3.212749824,11.968425750732422,254,3212749824,0.5855939809698611,Optimal,0.5360530489124358,32969,7199,14.472710577072576,13906,0.03955800808034837 +224240,0.49045672081410885,8372,8376,0.5536671618465334,1.1619730349630117,0.5399796243291348,0.06321044103242457,0.03362119919620454,8220,0.7288401401601732,238203,10,1,64,64,8B,213343.3574716149,0.05892709596082568,4.762722868937999,238203,8.030261248,29.915054321289062,291,8030261248,0.5859912640880793,Optimal,0.9387421838473529,37635,8219,16.452271425863728,13963,0.045778295025229454 
+596400,0.5983547926880419,12044,12048,0.6777467841748148,2.653722374001518,1.875488000921905,0.0793919914867729,0.2056352950166911,20460,2.2220516917295754,612283,10,1,64,64,70B,965500.0409067452,0.0730493909213692,20.028923405101523,612283,70.553706496,262.8330383300781,723,70553706496,1.5257919810246676,Optimal,3.3026473850477487,95379,20459,50.90600106609054,15883,0.1628595821093768 +946046,0.4775047143921256,15494,15498,0.5445251299533993,2.283439102116972,1.6354041469749063,0.0670204155612737,0.17483325605280697,32190,2.005914915120229,963447,10,1,64,64,405B,3172012.7008089907,0.06962158717215061,29.85055986023508,963447,405.8533888,1511.9216918945312,1137,405853388800,2.56223003892228,Optimal,2.5583339028526098,150073,32189,77.86599416891113,17401,0.18959671608172357 +3854214,1.9979437342844903,173186,173190,4.75627763918601,11.933482899097726,4.112962566781789,2.7583339049015194,0.03040059795603156,4140,10.42051934893243,4337060,82,2,8x8,64,1B,57041.81060181375,2.17517895414494,109.2090197771322,4337060,1.2358144,4.603767395019531,146,1235814400,80.18635749211535,Optimal,8.398531069047749,107753,4139,115.10326781589538,482846,0.024392321007326245 +7135218,2.101260715862736,176564,176568,5.0140090675558895,14.759843383915722,6.347998866345733,2.9127483516931534,0.04800663981586695,7200,14.323183785192668,7623714,82,2,8x8,64,3B,122291.9385011857,2.4431078990455717,118.39831594773568,7623714,3.212749824,11.968425750732422,254,3212749824,78.84844117495231,Optimal,9.923545255092904,188315,7199,130.30269417585805,488496,0.053027451038360596 +8216282,1.9884659524541348,177172,177176,4.743945160182193,13.453818985959515,5.6245344209019095,2.755479207728058,0.04394924081861973,8220,11.563520586816594,8703393,82,2,8x8,64,8B,178228.3264244111,3.2896198199596256,123.55457829684019,8703393,8.030261248,29.915054321289062,291,8030261248,86.02262015617453,Optimal,9.262494687922299,214965,8219,135.2341975120362,487111,0.0497884638607502 diff --git 
a/profile_results/real_llama3_optimizer_sweep.jsonl b/profile_results/real_llama3_optimizer_sweep.jsonl new file mode 100644 index 00000000..67428955 --- /dev/null +++ b/profile_results/real_llama3_optimizer_sweep.jsonl @@ -0,0 +1,8 @@ +{"cluster_copied_decision_vars": 101888, "compute_cost_estimation_s": 0.4790569522883743, "constraints_init": 7038, "constraints_solve": 7042, "cost_estimation_s": 0.5384756466373801, "decision_var_build_s": 0.6978608381468803, "decision_var_overhead_s": 0.09741721651516855, "edge_cost_estimation_s": 0.05941869434900582, "extract_s": 0.01627982617355883, "graph_nodes": 4140, "ilp_construction_s": 0.26339267240837216, "logical_decision_vars": 114928, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "1B", "objective": 75411.02054353141, "objective_s": 0.053351440001279116, "optimizer_pipeline_s": 3.032338660908863, "option_tuples": 114928, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 0.49132931185886264, "status": "Optimal", "strategy_enumeration_s": 0.6722944700159132, "strategy_options": 18503, "tensor_nodes": 4139, "total_wall_s": 8.946083615999669, "unique_ilp_vars": 13040, "validation_s": 0.31402136106044054} +{"cluster_copied_decision_vars": 194792, "compute_cost_estimation_s": 0.48607375379651785, "constraints_init": 8080, "constraints_solve": 8084, "cost_estimation_s": 0.5489266884978861, "decision_var_build_s": 0.7471572819631547, "decision_var_overhead_s": 0.1306607834994793, "edge_cost_estimation_s": 0.06285293470136821, "extract_s": 0.029804171063005924, "graph_nodes": 7200, "ilp_construction_s": 0.3148333504796028, "logical_decision_vars": 208698, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "3B", "objective": 155857.5709074804, "objective_s": 0.05978171294555068, "optimizer_pipeline_s": 4.169702837942168, "option_tuples": 208698, 
"parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 0.5855939809698611, "status": "Optimal", "strategy_enumeration_s": 0.5360530489124358, "strategy_options": 32969, "tensor_nodes": 7199, "total_wall_s": 14.472710577072576, "unique_ilp_vars": 13906, "validation_s": 0.03955800808034837} +{"cluster_copied_decision_vars": 224240, "compute_cost_estimation_s": 0.49045672081410885, "constraints_init": 8372, "constraints_solve": 8376, "cost_estimation_s": 0.5536671618465334, "decision_var_build_s": 1.1619730349630117, "decision_var_overhead_s": 0.5399796243291348, "edge_cost_estimation_s": 0.06321044103242457, "extract_s": 0.03362119919620454, "graph_nodes": 8220, "ilp_construction_s": 0.7288401401601732, "logical_decision_vars": 238203, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "8B", "objective": 213343.3574716149, "objective_s": 0.05892709596082568, "optimizer_pipeline_s": 4.762722868937999, "option_tuples": 238203, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 0.5859912640880793, "status": "Optimal", "strategy_enumeration_s": 0.9387421838473529, "strategy_options": 37635, "tensor_nodes": 8219, "total_wall_s": 16.452271425863728, "unique_ilp_vars": 13963, "validation_s": 0.045778295025229454} +{"cluster_copied_decision_vars": 596400, "compute_cost_estimation_s": 0.5983547926880419, "constraints_init": 12044, "constraints_solve": 12048, "cost_estimation_s": 0.6777467841748148, "decision_var_build_s": 2.653722374001518, "decision_var_overhead_s": 1.875488000921905, "edge_cost_estimation_s": 0.0793919914867729, "extract_s": 0.2056352950166911, "graph_nodes": 20460, "ilp_construction_s": 2.2220516917295754, "logical_decision_vars": 612283, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "70B", "objective": 
965500.0409067452, "objective_s": 0.0730493909213692, "optimizer_pipeline_s": 20.028923405101523, "option_tuples": 612283, "parameter_b": 70.553706496, "parameter_gib": 262.8330383300781, "parameter_nodes": 723, "parameter_numel": 70553706496, "solve_s": 1.5257919810246676, "status": "Optimal", "strategy_enumeration_s": 3.3026473850477487, "strategy_options": 95379, "tensor_nodes": 20459, "total_wall_s": 50.90600106609054, "unique_ilp_vars": 15883, "validation_s": 0.1628595821093768} +{"cluster_copied_decision_vars": 946046, "compute_cost_estimation_s": 0.4775047143921256, "constraints_init": 15494, "constraints_solve": 15498, "cost_estimation_s": 0.5445251299533993, "decision_var_build_s": 2.283439102116972, "decision_var_overhead_s": 1.6354041469749063, "edge_cost_estimation_s": 0.0670204155612737, "extract_s": 0.17483325605280697, "graph_nodes": 32190, "ilp_construction_s": 2.005914915120229, "logical_decision_vars": 963447, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "405B", "objective": 3172012.7008089907, "objective_s": 0.06962158717215061, "optimizer_pipeline_s": 29.85055986023508, "option_tuples": 963447, "parameter_b": 405.8533888, "parameter_gib": 1511.9216918945312, "parameter_nodes": 1137, "parameter_numel": 405853388800, "solve_s": 2.56223003892228, "status": "Optimal", "strategy_enumeration_s": 2.5583339028526098, "strategy_options": 150073, "tensor_nodes": 32189, "total_wall_s": 77.86599416891113, "unique_ilp_vars": 17401, "validation_s": 0.18959671608172357} +{"cluster_copied_decision_vars": 3854214, "compute_cost_estimation_s": 1.9979437342844903, "constraints_init": 173186, "constraints_solve": 173190, "cost_estimation_s": 4.75627763918601, "decision_var_build_s": 11.933482899097726, "decision_var_overhead_s": 4.112962566781789, "edge_cost_estimation_s": 2.7583339049015194, "extract_s": 0.03040059795603156, "graph_nodes": 4140, "ilp_construction_s": 10.42051934893243, "logical_decision_vars": 
4337060, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "1B", "objective": 57041.81060181375, "objective_s": 2.17517895414494, "optimizer_pipeline_s": 109.2090197771322, "option_tuples": 4337060, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 80.18635749211535, "status": "Optimal", "strategy_enumeration_s": 8.398531069047749, "strategy_options": 107753, "tensor_nodes": 4139, "total_wall_s": 115.10326781589538, "unique_ilp_vars": 482846, "validation_s": 0.024392321007326245} +{"cluster_copied_decision_vars": 7135218, "compute_cost_estimation_s": 2.101260715862736, "constraints_init": 176564, "constraints_solve": 176568, "cost_estimation_s": 5.0140090675558895, "decision_var_build_s": 14.759843383915722, "decision_var_overhead_s": 6.347998866345733, "edge_cost_estimation_s": 2.9127483516931534, "extract_s": 0.04800663981586695, "graph_nodes": 7200, "ilp_construction_s": 14.323183785192668, "logical_decision_vars": 7623714, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "3B", "objective": 122291.9385011857, "objective_s": 2.4431078990455717, "optimizer_pipeline_s": 118.39831594773568, "option_tuples": 7623714, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 78.84844117495231, "status": "Optimal", "strategy_enumeration_s": 9.923545255092904, "strategy_options": 188315, "tensor_nodes": 7199, "total_wall_s": 130.30269417585805, "unique_ilp_vars": 488496, "validation_s": 0.053027451038360596} +{"cluster_copied_decision_vars": 8216282, "compute_cost_estimation_s": 1.9884659524541348, "constraints_init": 177172, "constraints_solve": 177176, "cost_estimation_s": 4.743945160182193, "decision_var_build_s": 13.453818985959515, "decision_var_overhead_s": 5.6245344209019095, "edge_cost_estimation_s": 2.755479207728058, 
"extract_s": 0.04394924081861973, "graph_nodes": 8220, "ilp_construction_s": 11.563520586816594, "logical_decision_vars": 8703393, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "8B", "objective": 178228.3264244111, "objective_s": 3.2896198199596256, "optimizer_pipeline_s": 123.55457829684019, "option_tuples": 8703393, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 86.02262015617453, "status": "Optimal", "strategy_enumeration_s": 9.262494687922299, "strategy_options": 214965, "tensor_nodes": 8219, "total_wall_s": 135.2341975120362, "unique_ilp_vars": 487111, "validation_s": 0.0497884638607502} diff --git a/profile_results/real_llama3_optimizer_sweep.log b/profile_results/real_llama3_optimizer_sweep.log new file mode 100644 index 00000000..21b02b4e --- /dev/null +++ b/profile_results/real_llama3_optimizer_sweep.log @@ -0,0 +1,54 @@ +[14:16:02] start model=1B mesh_ndim=1 timeout=900s +2026-05-26 14:16:10,889 INFO:autoparallel.api:Graph tracing took 5.582s +2026-05-26 14:16:13,492 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=1.24B param_nodes=146 graph_nodes=4140 tensor_nodes=4139 strategy_options=18503 option_tuples=114928 unique_ilp_vars=13040 logical_decision_vars=114928 constraints=7038 timings={strategy_enumeration=0.672s,cost_estimation=0.538s,ilp_construction=0.263s,validation=0.314s,total=2.469s} +2026-05-26 14:16:14,059 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=1.24B unique_ilp_vars=13040 constraints=7042 status=Optimal objective=75411.0205 timings={strategy_enumeration=0.672s,cost_estimation=0.538s,ilp_construction=0.263s,objective=0.053s,solve=0.491s,extract=0.016s,total_solve_call=0.563s,total_pipeline=3.032s} +{"cluster_copied_decision_vars": 
101888, "compute_cost_estimation_s": 0.4790569522883743, "constraints_init": 7038, "constraints_solve": 7042, "cost_estimation_s": 0.5384756466373801, "decision_var_build_s": 0.6978608381468803, "decision_var_overhead_s": 0.09741721651516855, "edge_cost_estimation_s": 0.05941869434900582, "extract_s": 0.01627982617355883, "graph_nodes": 4140, "ilp_construction_s": 0.26339267240837216, "logical_decision_vars": 114928, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "1B", "objective": 75411.02054353141, "objective_s": 0.053351440001279116, "optimizer_pipeline_s": 3.032338660908863, "option_tuples": 114928, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 0.49132931185886264, "status": "Optimal", "strategy_enumeration_s": 0.6722944700159132, "strategy_options": 18503, "tensor_nodes": 4139, "total_wall_s": 8.946083615999669, "unique_ilp_vars": 13040, "validation_s": 0.31402136106044054} +[14:16:15] done model=1B mesh_ndim=1 rc=0 +[14:16:15] start model=3B mesh_ndim=1 timeout=900s +2026-05-26 14:16:27,671 INFO:autoparallel.api:Graph tracing took 9.505s +2026-05-26 14:16:31,732 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=3.21B param_nodes=254 graph_nodes=7200 tensor_nodes=7199 strategy_options=32969 option_tuples=208698 unique_ilp_vars=13906 logical_decision_vars=208698 constraints=8080 timings={strategy_enumeration=0.536s,cost_estimation=0.549s,ilp_construction=0.315s,validation=0.040s,total=3.492s} +2026-05-26 14:16:32,416 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=3.21B unique_ilp_vars=13906 constraints=8084 status=Optimal objective=155857.5709 
timings={strategy_enumeration=0.536s,cost_estimation=0.549s,ilp_construction=0.315s,objective=0.060s,solve=0.586s,extract=0.030s,total_solve_call=0.678s,total_pipeline=4.170s} +{"cluster_copied_decision_vars": 194792, "compute_cost_estimation_s": 0.48607375379651785, "constraints_init": 8080, "constraints_solve": 8084, "cost_estimation_s": 0.5489266884978861, "decision_var_build_s": 0.7471572819631547, "decision_var_overhead_s": 0.1306607834994793, "edge_cost_estimation_s": 0.06285293470136821, "extract_s": 0.029804171063005924, "graph_nodes": 7200, "ilp_construction_s": 0.3148333504796028, "logical_decision_vars": 208698, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "3B", "objective": 155857.5709074804, "objective_s": 0.05978171294555068, "optimizer_pipeline_s": 4.169702837942168, "option_tuples": 208698, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 0.5855939809698611, "status": "Optimal", "strategy_enumeration_s": 0.5360530489124358, "strategy_options": 32969, "tensor_nodes": 7199, "total_wall_s": 14.472710577072576, "unique_ilp_vars": 13906, "validation_s": 0.03955800808034837} +[14:16:33] done model=3B mesh_ndim=1 rc=0 +[14:16:33] start model=8B mesh_ndim=1 timeout=900s +2026-05-26 14:16:47,847 INFO:autoparallel.api:Graph tracing took 11.170s +2026-05-26 14:16:52,205 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=8.03B param_nodes=291 graph_nodes=8220 tensor_nodes=8219 strategy_options=37635 option_tuples=238203 unique_ilp_vars=13963 logical_decision_vars=238203 constraints=8372 timings={strategy_enumeration=0.939s,cost_estimation=0.554s,ilp_construction=0.729s,validation=0.046s,total=4.081s} +2026-05-26 14:16:52,893 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 
model_params=8.03B unique_ilp_vars=13963 constraints=8376 status=Optimal objective=213343.3575 timings={strategy_enumeration=0.939s,cost_estimation=0.554s,ilp_construction=0.729s,objective=0.059s,solve=0.586s,extract=0.034s,total_solve_call=0.681s,total_pipeline=4.763s} +{"cluster_copied_decision_vars": 224240, "compute_cost_estimation_s": 0.49045672081410885, "constraints_init": 8372, "constraints_solve": 8376, "cost_estimation_s": 0.5536671618465334, "decision_var_build_s": 1.1619730349630117, "decision_var_overhead_s": 0.5399796243291348, "edge_cost_estimation_s": 0.06321044103242457, "extract_s": 0.03362119919620454, "graph_nodes": 8220, "ilp_construction_s": 0.7288401401601732, "logical_decision_vars": 238203, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "8B", "objective": 213343.3574716149, "objective_s": 0.05892709596082568, "optimizer_pipeline_s": 4.762722868937999, "option_tuples": 238203, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 0.5859912640880793, "status": "Optimal", "strategy_enumeration_s": 0.9387421838473529, "strategy_options": 37635, "tensor_nodes": 8219, "total_wall_s": 16.452271425863728, "unique_ilp_vars": 13963, "validation_s": 0.045778295025229454} +[14:16:54] done model=8B mesh_ndim=1 rc=0 +[14:16:54] start model=70B mesh_ndim=1 timeout=900s +2026-05-26 14:17:27,109 INFO:autoparallel.api:Graph tracing took 29.053s +2026-05-26 14:17:46,179 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=70.55B param_nodes=723 graph_nodes=20460 tensor_nodes=20459 strategy_options=95379 option_tuples=612283 unique_ilp_vars=15883 logical_decision_vars=612283 constraints=12044 timings={strategy_enumeration=3.303s,cost_estimation=0.678s,ilp_construction=2.222s,validation=0.163s,total=18.219s} +2026-05-26 14:17:48,011 
INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=70.55B unique_ilp_vars=15883 constraints=12048 status=Optimal objective=965500.0409 timings={strategy_enumeration=3.303s,cost_estimation=0.678s,ilp_construction=2.222s,objective=0.073s,solve=1.526s,extract=0.206s,total_solve_call=1.810s,total_pipeline=20.029s} +{"cluster_copied_decision_vars": 596400, "compute_cost_estimation_s": 0.5983547926880419, "constraints_init": 12044, "constraints_solve": 12048, "cost_estimation_s": 0.6777467841748148, "decision_var_build_s": 2.653722374001518, "decision_var_overhead_s": 1.875488000921905, "edge_cost_estimation_s": 0.0793919914867729, "extract_s": 0.2056352950166911, "graph_nodes": 20460, "ilp_construction_s": 2.2220516917295754, "logical_decision_vars": 612283, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "70B", "objective": 965500.0409067452, "objective_s": 0.0730493909213692, "optimizer_pipeline_s": 20.028923405101523, "option_tuples": 612283, "parameter_b": 70.553706496, "parameter_gib": 262.8330383300781, "parameter_nodes": 723, "parameter_numel": 70553706496, "solve_s": 1.5257919810246676, "status": "Optimal", "strategy_enumeration_s": 3.3026473850477487, "strategy_options": 95379, "tensor_nodes": 20459, "total_wall_s": 50.90600106609054, "unique_ilp_vars": 15883, "validation_s": 0.1628595821093768} +[14:17:51] done model=70B mesh_ndim=1 rc=0 +[14:17:51] start model=405B mesh_ndim=1 timeout=900s +2026-05-26 14:18:40,587 INFO:autoparallel.api:Graph tracing took 45.218s +2026-05-26 14:19:09,868 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=405.85B param_nodes=1137 graph_nodes=32190 tensor_nodes=32189 strategy_options=150073 option_tuples=963447 unique_ilp_vars=17401 logical_decision_vars=963447 constraints=15494 
timings={strategy_enumeration=2.558s,cost_estimation=0.545s,ilp_construction=2.006s,validation=0.190s,total=27.039s} +2026-05-26 14:19:12,705 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=405.85B unique_ilp_vars=17401 constraints=15498 status=Optimal objective=3172012.7008 timings={strategy_enumeration=2.558s,cost_estimation=0.545s,ilp_construction=2.006s,objective=0.070s,solve=2.562s,extract=0.175s,total_solve_call=2.811s,total_pipeline=29.851s} +{"cluster_copied_decision_vars": 946046, "compute_cost_estimation_s": 0.4775047143921256, "constraints_init": 15494, "constraints_solve": 15498, "cost_estimation_s": 0.5445251299533993, "decision_var_build_s": 2.283439102116972, "decision_var_overhead_s": 1.6354041469749063, "edge_cost_estimation_s": 0.0670204155612737, "extract_s": 0.17483325605280697, "graph_nodes": 32190, "ilp_construction_s": 2.005914915120229, "logical_decision_vars": 963447, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "405B", "objective": 3172012.7008089907, "objective_s": 0.06962158717215061, "optimizer_pipeline_s": 29.85055986023508, "option_tuples": 963447, "parameter_b": 405.8533888, "parameter_gib": 1511.9216918945312, "parameter_nodes": 1137, "parameter_numel": 405853388800, "solve_s": 2.56223003892228, "status": "Optimal", "strategy_enumeration_s": 2.5583339028526098, "strategy_options": 150073, "tensor_nodes": 32189, "total_wall_s": 77.86599416891113, "unique_ilp_vars": 17401, "validation_s": 0.18959671608172357} +[14:19:15] done model=405B mesh_ndim=1 rc=0 +[14:19:15] start model=1B mesh_ndim=2 timeout=900s +2026-05-26 14:19:24,184 INFO:autoparallel.api:Graph tracing took 5.551s +2026-05-26 14:19:51,030 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=1.24B param_nodes=146 graph_nodes=4140 tensor_nodes=4139 
strategy_options=107753 option_tuples=4337060 unique_ilp_vars=482846 logical_decision_vars=4337060 constraints=173186 timings={strategy_enumeration=8.399s,cost_estimation=4.756s,ilp_construction=10.421s,validation=0.024s,total=26.710s} +2026-05-26 14:21:13,538 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=1.24B unique_ilp_vars=482846 constraints=173190 status=Optimal objective=57041.8106 timings={strategy_enumeration=8.399s,cost_estimation=4.756s,ilp_construction=10.421s,objective=2.175s,solve=80.186s,extract=0.030s,total_solve_call=82.499s,total_pipeline=109.209s} +{"cluster_copied_decision_vars": 3854214, "compute_cost_estimation_s": 1.9979437342844903, "constraints_init": 173186, "constraints_solve": 173190, "cost_estimation_s": 4.75627763918601, "decision_var_build_s": 11.933482899097726, "decision_var_overhead_s": 4.112962566781789, "edge_cost_estimation_s": 2.7583339049015194, "extract_s": 0.03040059795603156, "graph_nodes": 4140, "ilp_construction_s": 10.42051934893243, "logical_decision_vars": 4337060, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "1B", "objective": 57041.81060181375, "objective_s": 2.17517895414494, "optimizer_pipeline_s": 109.2090197771322, "option_tuples": 4337060, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 80.18635749211535, "status": "Optimal", "strategy_enumeration_s": 8.398531069047749, "strategy_options": 107753, "tensor_nodes": 4139, "total_wall_s": 115.10326781589538, "unique_ilp_vars": 482846, "validation_s": 0.024392321007326245} +[14:21:16] done model=1B mesh_ndim=2 rc=0 +[14:21:16] start model=3B mesh_ndim=2 timeout=900s +2026-05-26 14:21:30,429 INFO:autoparallel.api:Graph tracing took 10.867s +2026-05-26 14:22:08,135 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 
8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=3.21B param_nodes=254 graph_nodes=7200 tensor_nodes=7199 strategy_options=188315 option_tuples=7623714 unique_ilp_vars=488496 logical_decision_vars=7623714 constraints=176564 timings={strategy_enumeration=9.924s,cost_estimation=5.014s,ilp_construction=14.323s,validation=0.053s,total=36.956s} +2026-05-26 14:23:29,596 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=3.21B unique_ilp_vars=488496 constraints=176568 status=Optimal objective=122291.9385 timings={strategy_enumeration=9.924s,cost_estimation=5.014s,ilp_construction=14.323s,objective=2.443s,solve=78.848s,extract=0.048s,total_solve_call=81.443s,total_pipeline=118.398s} +{"cluster_copied_decision_vars": 7135218, "compute_cost_estimation_s": 2.101260715862736, "constraints_init": 176564, "constraints_solve": 176568, "cost_estimation_s": 5.0140090675558895, "decision_var_build_s": 14.759843383915722, "decision_var_overhead_s": 6.347998866345733, "edge_cost_estimation_s": 2.9127483516931534, "extract_s": 0.04800663981586695, "graph_nodes": 7200, "ilp_construction_s": 14.323183785192668, "logical_decision_vars": 7623714, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "3B", "objective": 122291.9385011857, "objective_s": 2.4431078990455717, "optimizer_pipeline_s": 118.39831594773568, "option_tuples": 7623714, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 78.84844117495231, "status": "Optimal", "strategy_enumeration_s": 9.923545255092904, "strategy_options": 188315, "tensor_nodes": 7199, "total_wall_s": 130.30269417585805, "unique_ilp_vars": 488496, "validation_s": 0.053027451038360596} +[14:23:32] done model=3B mesh_ndim=2 rc=0 +[14:23:32] start model=8B mesh_ndim=2 timeout=900s +2026-05-26 14:23:46,265 INFO:autoparallel.api:Graph 
tracing took 11.149s +2026-05-26 14:24:20,655 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=8.03B param_nodes=291 graph_nodes=8220 tensor_nodes=8219 strategy_options=214965 option_tuples=8703393 unique_ilp_vars=487111 logical_decision_vars=8703393 constraints=177172 timings={strategy_enumeration=9.262s,cost_estimation=4.744s,ilp_construction=11.564s,validation=0.050s,total=34.114s} +2026-05-26 14:25:50,114 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=8.03B unique_ilp_vars=487111 constraints=177176 status=Optimal objective=178228.3264 timings={strategy_enumeration=9.262s,cost_estimation=4.744s,ilp_construction=11.564s,objective=3.290s,solve=86.023s,extract=0.044s,total_solve_call=89.441s,total_pipeline=123.555s} +{"cluster_copied_decision_vars": 8216282, "compute_cost_estimation_s": 1.9884659524541348, "constraints_init": 177172, "constraints_solve": 177176, "cost_estimation_s": 4.743945160182193, "decision_var_build_s": 13.453818985959515, "decision_var_overhead_s": 5.6245344209019095, "edge_cost_estimation_s": 2.755479207728058, "extract_s": 0.04394924081861973, "graph_nodes": 8220, "ilp_construction_s": 11.563520586816594, "logical_decision_vars": 8703393, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "8B", "objective": 178228.3264244111, "objective_s": 3.2896198199596256, "optimizer_pipeline_s": 123.55457829684019, "option_tuples": 8703393, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 86.02262015617453, "status": "Optimal", "strategy_enumeration_s": 9.262494687922299, "strategy_options": 214965, "tensor_nodes": 8219, "total_wall_s": 135.2341975120362, "unique_ilp_vars": 487111, "validation_s": 0.0497884638607502} +[14:25:52] done model=8B 
mesh_ndim=2 rc=0 +[14:25:52] start model=1B mesh_ndim=3 timeout=300s +2026-05-26 14:26:01,331 INFO:autoparallel.api:Graph tracing took 5.531s +[14:30:53] done model=1B mesh_ndim=3 rc=124 +[14:30:53] start model=1B mesh_ndim=4 timeout=300s +2026-05-26 14:31:01,610 INFO:autoparallel.api:Graph tracing took 5.635s +[14:35:53] done model=1B mesh_ndim=4 rc=124 diff --git a/profile_results/real_llama3_optimizer_sweep.py b/profile_results/real_llama3_optimizer_sweep.py new file mode 100644 index 00000000..7e32b14c --- /dev/null +++ b/profile_results/real_llama3_optimizer_sweep.py @@ -0,0 +1,351 @@ +import argparse +import csv +import json +import logging +import math +import sys +import time +from pathlib import Path + +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +sys.path.insert(0, "/home/wangkj/workspace/torchtitan") + +from torchtitan.models.llama3 import llama3_configs # noqa: E402 + +from autoparallel.api import AutoParallel +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + + +WORLD_SIZE = 64 +SEQ_LEN = 256 +GLOBAL_BATCH = 64 +MESHES = { + 1: ((64,), ("dp",)), + 2: ((8, 8), ("dp", "tp")), + 3: ((4, 4, 4), ("dp", "tp", "cp")), + 4: ((4, 4, 2, 2), ("dp", "tp", "cp", "ep")), +} + + +def init_dist(): + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group( + "fake", store=FakeStore(), rank=0, world_size=WORLD_SIZE + ) + + +def flatten_profile(model_key, mesh_ndim, profile, total_wall_s, solve_ran): + model = profile["model"] + timings = profile["timings"] + strategies = profile["strategies"] + ilp = profile["ilp"] + solve = profile.get("last_solve", {}) + return { + "model_key": model_key, + "mesh_ndim": mesh_ndim, + "mesh_shape": "x".join(map(str, 
profile["mesh"]["shape"])), + "mesh_size": profile["mesh"]["size"], + "parameter_numel": model["parameter_numel"], + "parameter_b": model["parameter_numel"] / 1_000_000_000, + "parameter_gib": model["parameter_bytes"] / (1024**3), + "graph_nodes": model["graph_nodes"], + "tensor_nodes": model["tensor_nodes"], + "parameter_nodes": model["parameter_nodes"], + "strategy_options": strategies["strategy_options"], + "option_tuples": strategies["option_tuples"], + "max_strategies_per_node": strategies["max_strategies_per_node"], + "unique_ilp_vars": ilp["unique_variables"], + "logical_decision_vars": ilp["logical_decision_variables"], + "cluster_copied_decision_vars": ilp["cluster_copied_decision_variables"], + "constraints_init": ilp["constraints"], + "constraints_presolve": profile.get("constraints_presolve", ilp["constraints"]), + "constraints_solve": solve.get("constraints", ""), + "strategy_enumeration_s": timings["strategy_enumeration_s"], + "compute_cost_estimation_s": timings["compute_cost_estimation_s"], + "edge_cost_estimation_s": timings["edge_cost_estimation_s"], + "cost_estimation_s": timings["cost_estimation_s"], + "decision_var_build_s": timings["decision_var_build_s"], + "decision_var_overhead_s": timings["decision_var_overhead_s"], + "ilp_construction_s": timings["ilp_construction_s"], + "validation_s": timings["validation_s"], + "objective_s": solve.get("objective_s", ""), + "solve_s": solve.get("solve_s", ""), + "extract_s": solve.get("extract_s", ""), + "optimizer_pipeline_s": solve.get( + "pipeline_total_s", + timings["init_total_s"], + ), + "total_wall_s": total_wall_s, + "objective": solve.get("objective", ""), + "status": solve.get("status", "NotSolved"), + "solve_ran": solve_ran, + } + + +def run_one(model_key, mesh_ndim, skip_solve=False): + init_dist() + mesh_shape, mesh_dim_names = MESHES[mesh_ndim] + mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", mesh_shape, mesh_dim_names=mesh_dim_names + ) + 
set_nccl_topo_config(detect_nccl_topo_config(mesh)) + + config = llama3_configs[model_key](attn_backend="sdpa") + config.rope.max_seq_len = SEQ_LEN + with torch.device("meta"): + model = config.build() + + def input_fn(): + return torch.randint( + 0, + config.vocab_size, + (GLOBAL_BATCH, SEQ_LEN), + device="cuda", + ) + + mp_policy = MixedPrecisionPolicy( + param_dtype=torch.bfloat16, reduce_dtype=torch.float32 + ) + t0 = time.perf_counter() + with AutoParallel( + model, + input_fn, + mesh, + mp_policy, + repeated_subgraphs=True, + ) as autop: + autop.add_parameter_memory_constraint(low=None, high=None) + input_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1) + if mesh.ndim == 1: + output_sharding = (Shard(0),) + else: + output_sharding = (Shard(0), Shard(2)) + (Replicate(),) * ( + mesh.ndim - 2 + ) + autop.add_input_constraints([input_sharding]) + autop.add_output_constraints([output_sharding]) + autop.sharding_optimizer.profile["constraints_presolve"] = len( + autop.sharding_optimizer.prob.constraints + ) + if not skip_solve: + autop.optimize_placement(verbose=False) + profile = autop.sharding_optimizer.profile + return flatten_profile( + model_key, + mesh_ndim, + profile, + time.perf_counter() - t0, + solve_ran=not skip_solve, + ) + + +def append_jsonl(path, row): + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("a") as f: + f.write(json.dumps(row, sort_keys=True) + "\n") + + +def load_rows(path): + rows = [] + with Path(path).open() as f: + for line in f: + line = line.strip() + if line: + row = json.loads(line) + row.setdefault( + "constraints_presolve", + row.get("constraints_solve") or row.get("constraints_init"), + ) + row.setdefault("solve_ran", row.get("solve_s", "") != "") + rows.append(row) + rows.sort(key=lambda r: (r["mesh_ndim"], r["parameter_numel"])) + return rows + + +def write_csv(rows, path): + fields = [] + for row in rows: + for key in row: + if key not in fields: + fields.append(key) + with 
Path(path).open("w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fields) + writer.writeheader() + writer.writerows(rows) + + +def nice(v): + if v >= 1_000_000_000: + return f"{v / 1_000_000_000:.1f}B" + if v >= 1_000_000: + return f"{v / 1_000_000:.1f}M" + if v >= 1_000: + return f"{v / 1_000:.1f}K" + if v >= 10: + return f"{v:.0f}" + return f"{v:.2g}" + + +def write_svg(rows, path, x_key, series_key, title): + metrics = [ + ("strategy_enumeration_s", "strategy enum (s)"), + ("cost_estimation_s", "cost estimation (s)"), + ("ilp_construction_s", "ILP construction (s)"), + ("objective_s", "objective build (s)"), + ("solve_s", "solve (s)"), + ("optimizer_pipeline_s", "pipeline total (s)"), + ("unique_ilp_vars", "unique ILP vars"), + ("constraints_presolve", "constraints"), + ] + width = 1600 + height = 1000 + panel_w = 360 + panel_h = 180 + margin_l = 62 + margin_t = 120 + gap_x = 30 + gap_y = 50 + colors = ["#2563eb", "#dc2626", "#16a34a", "#9333ea", "#ea580c"] + + def sx(x, xs, px): + lo, hi = min(xs), max(xs) + if lo == hi: + return px + panel_w / 2 + return px + (x - lo) / (hi - lo) * panel_w + + def sy(y, ys, py): + positives = [v for v in ys if v > 0] + lo = min(positives) + hi = max(positives) + if lo == hi: + return py + panel_h / 2 + return py + panel_h - (math.log10(max(y, lo)) - math.log10(lo)) / ( + math.log10(hi) - math.log10(lo) + ) * panel_h + + series_values = sorted({r[series_key] for r in rows}) + x_values = sorted({float(r[x_key]) for r in rows}) + svg = [ + f'', + '', + f'{title}', + 'Y axes are log scale. 
Missing series points timed out or were not run.', + ] + for i, value in enumerate(series_values): + x = 32 + (i % 8) * 180 + y = 84 + (i // 8) * 20 + svg.append( + f'' + ) + svg.append( + f'{series_key}={value}' + ) + + for idx, (metric, label) in enumerate(metrics): + col = idx % 4 + row = idx // 4 + px = margin_l + col * (panel_w + gap_x) + py = margin_t + row * (panel_h + gap_y) + ys = [ + float(r[metric]) + for r in rows + if r.get(metric) not in {"", None} and float(r[metric]) > 0 + ] + if not ys: + continue + svg.extend( + [ + f'{label}', + f'', + f'', + f'', + f'{nice(max(ys))}', + f'{nice(min(ys))}', + ] + ) + for xv in x_values: + svg.append( + f'{nice(xv)}' + ) + for sidx, series in enumerate(series_values): + pts = sorted( + [r for r in rows if r[series_key] == series], + key=lambda r: float(r[x_key]), + ) + color = colors[sidx % len(colors)] + coords = [ + ( + sx(float(r[x_key]), x_values, px), + sy(float(r[metric]), ys, py), + ) + for r in pts + if r.get(metric) not in {"", None} and float(r[metric]) > 0 + ] + if len(coords) >= 2: + svg.append( + '' + ) + for x, y in coords: + svg.append(f'') + svg.append("") + Path(path).write_text("\n".join(svg)) + + +def plot(jsonl, out_dir): + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + rows = load_rows(jsonl) + write_csv(rows, out_dir / "real_llama3_optimizer_sweep.csv") + write_svg( + rows, + out_dir / "real_llama3_by_model_size.svg", + "parameter_b", + "mesh_ndim", + "Real Llama3 optimizer profile vs model size", + ) + write_svg( + rows, + out_dir / "real_llama3_by_mesh_dim.svg", + "mesh_ndim", + "model_key", + "Real Llama3 optimizer profile vs mesh dimension", + ) + + +def main(): + parser = argparse.ArgumentParser() + sub = parser.add_subparsers(dest="cmd", required=True) + run = sub.add_parser("run-one") + run.add_argument("--model-key", choices=llama3_configs.keys(), required=True) + run.add_argument("--mesh-ndim", type=int, choices=MESHES.keys(), required=True) + 
run.add_argument("--out-jsonl", required=True) + run.add_argument("--skip-solve", action="store_true") + plot_cmd = sub.add_parser("plot") + plot_cmd.add_argument("--jsonl", required=True) + plot_cmd.add_argument("--out-dir", required=True) + args = parser.parse_args() + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s:%(name)s:%(message)s", + ) + logging.getLogger("autoparallel.optimize_sharding").setLevel(logging.INFO) + + if args.cmd == "run-one": + row = run_one(args.model_key, args.mesh_ndim, skip_solve=args.skip_solve) + append_jsonl(args.out_jsonl, row) + print(json.dumps(row, sort_keys=True)) + else: + plot(args.jsonl, args.out_dir) + + +if __name__ == "__main__": + main() diff --git a/profile_results/real_llama3_partial_presolve.csv b/profile_results/real_llama3_partial_presolve.csv new file mode 100644 index 00000000..ab7b7fa9 --- /dev/null +++ b/profile_results/real_llama3_partial_presolve.csv @@ -0,0 +1,3 @@ +model_key,mesh_ndim,mesh_shape,parameter_b,graph_nodes,strategy_options,option_tuples,strategy_enumeration_s,unique_ilp_vars,logical_decision_vars,cluster_copied_decision_vars,decision_var_build_s,constraints,solve_s,status +1B,3,4x4x4,1.2358144,4140,662279,181062856,459.509,20390366,181062856,160672490,462.310,,,timeout_before_constraints +1B,4,4x4x2x2,1.2358144,,,,,,,,,,,not_run diff --git a/profile_results/real_llama3_timeouts.csv b/profile_results/real_llama3_timeouts.csv new file mode 100644 index 00000000..c3e6c843 --- /dev/null +++ b/profile_results/real_llama3_timeouts.csv @@ -0,0 +1,3 @@ +model_key,mesh_ndim,mesh_shape,timeout_s,result +1B,3,4x4x4,1200,timeout_after_decision_vars_before_constraints +1B,4,4x4x2x2,,not_run diff --git a/pyproject.toml b/pyproject.toml index 31b0df19..3c5a55c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,3 +61,9 @@ exclude = [ "autoparallel/tools/overlap_simulator/repro_.*\\.py", "autoparallel/visualizer/build_display_from_json\\.py", ] + +[tool.pyrefly] +search-path 
= [ + "/home/wangkj/.conda/envs/pt-dev/lib/python3.12/site-packages", + "/data/users/wangkj/pytorch", +] diff --git a/qwen3_8b_autoparallel_30steps.log b/qwen3_8b_autoparallel_30steps.log new file mode 120000 index 00000000..5cc45d55 --- /dev/null +++ b/qwen3_8b_autoparallel_30steps.log @@ -0,0 +1 @@ +/tmp/qwen3_8b_autoparallel_30steps.log \ No newline at end of file diff --git a/qwen3_8b_autoparallel_30steps_loss_curve.png b/qwen3_8b_autoparallel_30steps_loss_curve.png new file mode 120000 index 00000000..c8413f8d --- /dev/null +++ b/qwen3_8b_autoparallel_30steps_loss_curve.png @@ -0,0 +1 @@ +/tmp/qwen3_8b_autoparallel_30steps_loss_curve.png \ No newline at end of file diff --git a/qwen3_8b_autoparallel_30steps_loss_curve.svg b/qwen3_8b_autoparallel_30steps_loss_curve.svg new file mode 120000 index 00000000..babd3d4e --- /dev/null +++ b/qwen3_8b_autoparallel_30steps_loss_curve.svg @@ -0,0 +1 @@ +/tmp/qwen3_8b_autoparallel_30steps_loss_curve.svg \ No newline at end of file diff --git a/qwen3_8b_autoparallel_30steps_losses.csv b/qwen3_8b_autoparallel_30steps_losses.csv new file mode 120000 index 00000000..47d30691 --- /dev/null +++ b/qwen3_8b_autoparallel_30steps_losses.csv @@ -0,0 +1 @@ +/tmp/qwen3_8b_autoparallel_30steps_losses.csv \ No newline at end of file diff --git a/qwen3_moe_mast_20steps_loss_curve.png b/qwen3_moe_mast_20steps_loss_curve.png new file mode 100644 index 0000000000000000000000000000000000000000..8b4d9c43f227e00009f42077c6b257e19591586a GIT binary patch literal 19666 zcmch<1z1&GyEZxj2|+@60hJO1q(!6~MU)nh?vzHlK><;bl5Ujl?pPq*5{piyVbL9n zf2`%Zzx{pR-us;YKj&P>%j;sU8Doxkp8L6<81s{YoFpOsZF~rV2w%QBa_P3-zwq3m-xUUCKfAl ztLugl-rbKf<9HvhVW^iFZ?U>7b+P5LDwt6GJUIKk{qg1PuiU+hUq|0r*LOQWXGA0MO27|=2milaa(DW@ zWTh*cvR|mVC5GRC2L;-z)IhuizLLQ<6E*9@v3ta{Lcu|S-BIvTy>}k($hym|;QJTi z*pUT&*RB)OAg1i~`jO3*TQlC^73o?6v$7s^k8aF3AGh`XT#30k+7l}zC}?NY zf2i}$1DjxJYdk;IS&p6VnORg4|U!3Ct%bh zPWkIS%U>kOxp^L}56Qfqa_-B&8RTh2T-^@O(`U(%|Lw2sqPwZ-1TGFzzRzku5TppFYnKzH->xbeF;vEn3ri&c6 
zeRRxdwcV7N2e+;CPG1yGR~@)U7t#uTi8yL$X=`n16Y}`Wb#a1_?pf|qP&yDl?vEpw zYMRHs0lgCrQHput;YuL|)I?hv-Uk~^LAm5Ag^I86!XhHdR{CKLYn;4{onsN-5H5wH z4Z=dTdAb~z3r(y|nTG{x`?-P#2OB4W7*C(4|Kb8KoT{)-K1?FZ<*D;HVM4&yChN_N z-)z51^lr;E^gl^W;$L^>YehaWQ0Q2_SXb@oPwdvjyKcAWHkyQ`6h{};RqJ**_vK|O zr;&?&XWV1^q`yjKq7xG8oPMh$CaxoA%(v^Zk#QP18ZXxx2xVx^6crTQS;a&@S*+xfzMu|Jwei9TBKfv_5{X^G%Kxy9Bu>_Al})r?!@y@`W`&XteT zKf5pQ+d114$j^F1y^8jHZD%m#sObf8VsG!rs+wUmtD`+T&NR1mo+{V&F=L* zlotAz@7aiWo=)ZMs@S+tM!?ob?e~w(c9C&#tl5+K?BlqM^a*8eN<91LZogHs>Janu z1$+F~1qHLjYlbT0(uHV zcW<1>if%^R`sj{0-d2JR1M%%M@5E{u81GcIyEpO|*(H^LqX^46eEnE!hHr1VTPZ_% z=d|2HIY;3+@}BRf01TBzXUaVD_D!r^r~I{d6IT@#(-lwhl9C1w0fxyZ*WdBm?puAG z_vX6U@&Kvnf^W!0{3lxB&vj&O%d=Q=-MGpGd3UT5U@p~SLx&==F-|~28RU~6+@}}z zzV3Lw5PMxj*8I?Hc^j9QP=%6`aeaxOhrhCitde&A3q<9+ zNA_vA5@wM2F`V9=*_;$TOoQg%4GlCrM=nNZk$Cm?@<{sZp{iba(hO=r50+m|smJJl zK&NO#Tusg+bSa-$vG>1>e+U=Z$aV>XuXB5!8W;Kogh)uX12j$)^3Lv0p?w|{Z?4jm z!Fln^_N1%4ja2}-v#hIHIQ!*Oko1cecp)yfyZa}z6s(u~-ASNnuJuQa{CFirIrbFq z7ZCIyKDb4Cbr&{2QILU9iTG1>i|4gFYQoZ)LCkz4f zCc>$~i~4fi#yjmvBQQVr+T{*!@rt{xY-|Xwo!M}jJlVp{|81I|n2_K{Ldk98eY$Z{ zPFWdDCBW!sYwu_^xoddwyO~ySWB;{xw*EcILrdu;Sro*7h$O&YIJz$WOtO7{jc0#w zG1mmH7v&s$`eB4Jv(C;ayb?57V&cAc83pFH_UMNqe=89F_zHxu?QzWh&mm6f@aG4z zg|_R#^&G`-2c3r62on+#gsrDU-Hvw`b9*O`(u^I*3snp@E8n%`g+(!)ysPtE-n@uP zcFdhgkw_L3VNol2%zK|2rr&eWU#!Ib{X0N-yVaJYpL40eFj0uV!94fAIpICDCL%{Y z;f9!Mx>q}TbXiloe~Tc*WqZia%p~zH8CYQVu=q-A;)}Nl`#3%u<0Wi~BEQ$heWksU z8{DQ%%=kx2dTMxkV(K69^FPPCuFzvMnn5<&M=khPZx2ku2cq*D@oL3e$crSN9k9U1 zAADa6dF?9C^d+(8HW}I(c~d$h3A)l-DHG;B+=^Jz3Q6W?zIYsZx3!ow)1PEufFqg> z7JHs`xE0WobkrW6X`^Gh5Q;o7S3TOcEqqlzSUGzxKjFH%3nz>xtRtQ3>@*u$NPdEW z2%dZp7*e`@HPs_5vTVBDAD+TGEWyoB{e&|nNEUmlHT7Q(dd)_^~GVL9+ zPv#9%XnL@hvAEx*ZxAN_k!yVx`K>}d{Db(%3^Bs;B)De?Og**KrTm6Hj1Wm3cuYGJSQOg5x%*L@ctl!1zLh`PvznQPeWr5YXX$js%sa=lb5u!ZKz`NO5|lGD+p zS4!E4fPwkSjZfj$UszNa7%0uQm5?rNUgLafc`!K36v=KlqVMBKVx({XHGI~(+xvN* zVs^qg&zn_w)xnVX1Wpr6g~E*UUy)1^k(JJyu9JIr 
z0F%`^zTnB_baSb4ud01H`RWx%#JZ$YHmNw&6Ul{M_?9Vkv*J{)||&$VGyQsgecuqaj_&Bc*h2KDFcJ&{El( z(Q->T@rBgu=!VVRH4=8+2DihhFAWP%kVWyxy8&f0c1N##Cw85Fe{hr)30nm!wYU4Chvb{h0kcln~&eU^@{CB0j9(rKgE zW?gqTefAkw?rX2vqk3TC(}MBno<(*o29mMfy@abp#l+mF4X1nhVQR)^wq|}~BZ1SD z$n0(UXyI>QS6S5VBqOT_OX6xirS`?a_r~{lbgdI%$>t(G<}0Q5Bco$t9M1ZtY1`$- zy;t6Nc$}BKUJnlsZ%6XBMBE*<9Ajc%9Lehp*hQKIQGPg|Oe}F~j`FInO}BQ4%2$bB zoyEzfFWk(5!+>nlv3f@lles!|^^A>1|mn4mVv`%>o|Q>bW8Z8#lL#cV-`LlWZ(|4{$)i0@|B?iJq|cCkaUw zyl`J<8)6RYKWK%mv%cWJ5a2W2Po3F`TZ+>E?i^fGv<0*{+_ZlIk(PG<Q|uXEk*Cd43CE{1F!)_C`(!)ST$W$tllVulJYh5YR zkt3X^t2r~{y^>XTKYU~5(-8ajnRBnMudPpOB;KdFMM~`+_RJzfHSm~TKU6B~I2*ju zxak?dhP}?+VbT-R#9%{XHrWe5IQ4Hn4ZIA%p`EU@D#s33P2fm9)D_>Fk>3%Ld0u8s zbKm*OQh#)%4&pWr@eqxkfx%#NN9B;&)vH%SXklhjeG#3l$N8m$eH~fM7LISH8a%fU zhtp?v%r9cU-lm9lmN^4hOm}%I^Fk)V^r*5x^ZOolz3qnVPeISa_jTX>uLInr^saPd zYRF)GZ9z^7BlKH*o^y3OS@`X+JN90;I}5+`8;qAPUuO2zRkNq425&Uuc-`~1$+Ye@ zvFO^`Tgv3(<2xBhtrW?hzfR05K+W6(_P<#%lV9-LjrjljWN$VbR4hvMFlW;vd)&Ay{EdP1)M2Y{L8CQ5ey+y zalB-~CD!`;JKMH4aKRT7n?DQE!zw#R3zf8IW@-!z>*@_d4jIzEA`MKVjE#m_n}SyU%s^Y{c$CW876>E6B3|XjqY8)TIqAQCg))Xq>!sW2dX1cBUiU>JEyB+6fjMR{pSBuk zY4+KF9_nA-8}me?!SSpmyOg2-nDxA=86Xq87XIq~>r3EzNXbXCYJXQfVlNZ<3Fx`_ z$>3mx!&Nq=oT}j()oKGiI=?CE7!Jb^Op#sPv}3@FboKY66ZvvsF`Xe97NXnW;Rwjq zZ-#!EKcu{EchDOzSK{Wz zjeHbo^ZWSi7{uuB0Z;c0Yj$&3k6~Spm&h~e(Xrj`*a3tPaA5Bw-lr3>V4RnQvf3XT z|5^&>w@K0xs;zfbI(+w3GMYtq5AeX!W%k~LQiTci8owpr#s(#DaB!>wQ*}|kNl2;C zBd;Oe@ez!h=VB5ol%ewIOoN)}E9Qyk0>yZVr!$a*?3;<{IkKM^0iaTvd3gM=A-KGj zx3_bYO1~lEu)hLfwK;rWzPQ&}`xgGb5Jv5_*5eO?L#w3}V6H`%V6BR}6$X5SaizAdCZWXRh0c20zJ*Iq&f`}Lzcc%joL5eR4olTbTx4z? 
zJ-sSh93iaLz#djX8$kGTwQo4;^Q$AEw@u9*krv$ebRR;Jcs=%?on%9ocVU21 zeHUF_jV~9MuQNt6OcpQuOl5z+@TRnS%@lcQF-=eROvZAmBJoVa)C# zqM0{1J4j{`U7(hGN!R+B{Fac1-f2Vv{ZpL%&7TL9t0HBo?egK9Ix5u$<61L1%j`NT zHT;G%N1j{Dv^d1Kh3Od?SHXH&!B*%HTT?qj>C7W9wmhFyJ{|1{DN?5h0OPWmE*_mT zcK{Cky?}k5KmL}2sBm(f1v0@I2%fB_bE2^OJQ!Kl%A4wZbA?$Np-TY@=G=_(xI7Z^ zwTC3)SB}q&`=efa>DD>%J!SSCL;8%R$Y#rregbwuE9kMmakAGF`*?ZxkZJA6D}U{^u-S;3&U*&!NMz4pT>n1mJ6^F%;9c2WayJk^}Xb=QG+f4s2D zN$kwy4f`P$wB|jMaUxJ_=uZOd<9LZ1IipsV(*YA=vLbXj_RCMCHf4-rUG6#k%vCy- z!pW&$ze8Adxnw2E&BAKJKm;>(d#GrxTh+Hv3xt35l^uElvsSL%N&pd6auBo8ztG6n ze8RCPZRq)$DO$JOsBc(zIho&Hn2zq79?}DZF^*qH_fI;m9c&V-XylTMZISix820_( zTxRdA%(5EHdRI57zE-24XmHx9f4=ffjCN$weLK0EZZvJnLsfP3{fQ?gbJSVMQ_U0d zwAZl4(v9>s(2y}iB1rfV^abq(5>+=X#>OcJh*Jh8hES@m@M_%KBr_y%bf z4KU50g!V7y8oQlBb)lgn;u1*Dc*vCfqs#WAl~gytFCv&TRb5(|7jt=fk}B0s9kue_ zg!Nh=Lsd^8Jt{GB2wK0z`S!HVzx$f-Yv3Vm?(djP_h8%FUQvgjJO+Atr^&wh_{y#7 zx13LY&8J?IyH!_TFJ+GT{xZ=biuiAhM0zM04v3NvI_%p{`XpGbpK&_?8HpQWnUFR{Ork&!ZE z01xRstGO}`=sB|9any#hEm^QC-8$bKgb8M+-Sx#k#*9^M&U%d2NQ4hbR%blV;7cio7IGcgF+fbL0FVYz#x}!9lEeZ=tW;95Ks?xB?#e zQsfcZo4}`(**_^4$Gr^vEdaz|rK>;)I+MuCpTx9Jn+gE+BJ-$Dy&E-^9hcxQkkr7w zwpr%#8a!Xb*=Oec384G2Ip$&jS1xW}G8YFUU$%6=O)~7`X-drO_i814@(u>Shp)x$ zPr43}zLL{5p6vh>&bWIgE6tJ;SoWPW>X|@WV?5Tc(5a$deLCRhza@zjA9pyd3w5ZH zuH;??J*qoIB5Du0EGE^J8u}zzg=edeQjoY#n;fF2qPntrom~f3brS^9;`|O587}_( zq4mq~NDQ7n z>vI3=L>bjFOoBX(Q1d+#ITGh&sbFzKAh5f zxG0#!pfU87#0>ir(}}8&0U`WO>zZSg3biA>3(>cQ-3(S0Zzul9yUa$-IX3sG#@uSG z@#g;9bm=#OQ_~vU774QqTdmS`QWTMQm2xgz7o6Wkc2(C_zs}Zbc(&sMAk2|yti&p$ zvQjmD*VvB)MBXf??>>|!CkDPN@ak%uE;2mEK>T1@OWQ&vKcUYMWa|`g;|WS8Xh2-l zoxKP4M*kwyi;Cw#idH^wp*N=N8&Caj8Kr5Uyvsr0F&9i%y-v4tP0UoW8Jj9mH#DpD zLabSETLyjlRPVZe7-pEcHd4Tb2>JXOX*`jzg?lvMZA^KNnqRedU-PlsTHD)?sGpR` zc86SCIHb^TMIcTHpd^X7p`cWR zj!u6e=IX)Ph}#MO%gga<*k!Tnq#Of7CJ|zCg=!nn6F|?LE=qjMN>|&(n!Tws>kPE% zo`ts&mYQ_aAub&^-%ph1h`62KJ)i%*vZ8iEcIq|#6mMsDvHR&I)z*OXN=vdN`N@%W zW6m5RQfRl6Gd))U$56Ef-OI2;T^?*@(!G*vP)eHB~$cVw?TOxNdRV)1bo+UQlxIyZEKpDlHIhv>YoKJxA 
zIHE63&~2h6Ec$7|!3IxRM4N7Hy5mH{g|Sp*mTv85lDojjc!f7vX?jg920Fe*VKx|^b(KD8--^F5=|!(ZZA!=Al$ z;mz8?*AJWZ+c%4_D83{X3nUeNe36^jO|D6BgEahgHoKJvCkf`)dm}!u28rY`Sk2bc(zq?_M6*x0R@IWRgciDn6BE<$(fd{1 zq`F~u@IklJydkb}N6Aabb|uR@_BKxc_l-;-YK?K0Y#sCbvhp&?`W4_8_uB@>e&kC(NyX8msV+OYn>(z94m4Kz^se6qdqAw?i(U?*|0uZ zYI4w+Yf6kyd4`tASRj$%8NIQ5&S7Qsmhc)HZ2^x_n5x>bFMw=(I?OKk*QDDVRW{4P z$z8OO_uMP+jinvexgN2ci-rC0H<6`LDsk~)Rn=^;?7Z~6m`-#*+B;c541ep*%#Z8p zFHnc24+J;@T}uxDVBHhXH^2On)3jH6VQEPt(**v`v*EGgSH)x$sB6weBqZcH5ORyT zm5zqnVn?0g2YxKmkT1*E-;h%McCD?zfehbv>>WW{-xzfWSG%=+KFZqoF#g|X+9~TL z@(&nGwt=~Z6oYO7(+p+Z;4Bfxn-_2?ExXRo)^Td;Jcb>P0YQWBZ;e+ns+dxOdn$Sc zeDl2Q%I%^KsO!{EO?$(^0*+y55GWd{RLN+|zwa|Dx*J3Y?qre(E&rmNrsEslWQ1y+ zl+P`ckeVWm^x@(|GnI@oOB+$C2cUJ)r4=!ibDr)D{>K>>BOazwatGs)+6Cas9ag*& z@rQr?mPx+mWg5!mB`TGg`ji6n^R}4EOL{pR%8;0)HZ6V!49LgxPDjK1BI%uos#jJM zrN0Gqo2HQ3H&_9TVZYD2kd|PA_cuJ#1cuxKucV6r%pp?SJjh|?c7)tO)Z`u}qw?mxz%d!T&1qNg*dPV&qBfN4M9-NbH zAwK7H4a@+w7pYqiJWnbr^NTho1LogNGCPwogzsv@UE$8d;_k(4m1U>bA{8dV+ruODH@Kn5b zD7K=JQMraVn8?rfIxy*rU9x1S6Yqdwe7Qpq6BMffDoilM$X}2uiRU-HzOLv7gf<4u zjGBzpHS2 zr6~4vU>=5K;X_5zfYplsGyy%Md(Sc_x}Ry{$WQVSbcO8I{|;$sYiT*8*}edQS=EAR zqt(ylBRLdBxD%alz3z8^nZ5))p3Gc`uV3m-xZ`(oXDAJr3s+NR<^74E$>)TZBZb>% zlTMomxXns%P`BqSbIQd#;2Q>k_)udo7mxMK=AgZg_rQ z*2mub|DX|rPTeL!rhV!zJBlBjLS!p(%KE_e6-X5T<3dO&Z#R4XHAYh{ z0=7<%@Cm$&057L|CR(jAsB{nK2r;#zgdnEksM>OFZXbSg{8B}}pKjKx_bax~dl$`{ zkSL=Za2=#Z^+@YHFo6;M@ifnS4QNI{Vu}GU9${?zHW?}Qmya$IgPVh4n{9l555GOm zaGAUuVqrn6>?NabX$1tO5s8tDx($3PJ?)HSA%I>Gyl3&*UyNY|ZUpn4e-wA+J@YnK zac{+%01&kVYQBnS5G=Vc>*zm%OgcVAcpVz6j94;0pm>r}O^)^!ePkc;iskP81wj*U zSRp+t{p(@Rli<3(QC?V(j~Gz1A_0YHe##-w0F z9jJ4{sB>31OCV*`eYQs6VFe-jHyDi^00G8AoM-NT@gaZ?{YEgM!{=)}p{Y4upW)*; zP}W~!PiW(o-GY2>gVVhD2o?+5R<>U(P!>VA z4TkL<#@_BV7SZlsT2Kl;c&*PS^^cfDZ0I$Zn?}kObx46ig8-TnxyDV2cS;Bp#}}KA3}rYE#;l*{83v?=nixI(iPF+j zv?&iY2LpRSG4WPKr24Bd#vR0ep$w~XVARiRh#-g_tfddv8&mbfzq&QFp-_Gn7&fG0 zV5<{F>5Czn16Bc&`Xz@Cq1!A%QAppjPo+>OK6phD85sT}0uUCkBmkyJ<{9wpHJ>G5 
z`#7T-MGS_N;*(=R6(2yQR)w5E-(xNU2w*srg_Vc{rTBugIby$Apf|w!_;it&Vu%8p z1nLRf}B` z0K|N#AIBIdO<^x8$yiD16-IH`)15e8e|?uG^a!} zEQpuH9s`jD;$lW7!KU5Be<#&<>4nF|)RRwu6b$&<#^Kmu)u+@Bmx3 zPPEU?C5I=D^Le5G9- zI}l?o2FR4p@KA8+^9&PeC+fS<>Syrm3ub;o=;3>CR0Q2i#8E7JFpXKn3#7Qw0EpkT z%^eZxC~Q4-)cgEz$rLjpFi?CGFs^?tiM><*26iEBN-Tr}4F;ncm7GvlVQ4Lh!0>ru z2g+6-G^4(rdomUuwtWKifgMg;~GgC39sqvLI$ zcO58rn3Mpjcn@`?DG82zQAds`Ly!*<7^^53N+5zj&lPTgr(d8hz6S3@DQb#}!)=HN zH7W%ja4i7U*DbG>P&_49zqtS3;|5kqBz8v^mhKDhfw6^)DfzBKcEh z`_(Os6cmU7=$0$UGla%yP18XL=??X+SpUcpmZ35qHxx42vhedL=$Vr%u%i~_x>>tA z=s_fEZm1~Uk}Ghb&JL_PpaxlvGG(H_S-zO%%qbpTLT zpT%3TiQuXf&bW0w?uj|3Cl|;A1@jogSm^01j(oO69;(*bA?T%5^U9@)!Q2(GN>z{p zPg4w!<5%X(t5-i|Y(xjqh;Z%hgkuSeOjWtoO7~+RB?wfRbm=eFQMgF>O4mY6T>M8{ zufW;SW>mL;DM+%(-sVy(^$Hx6z<{=CPpOOd0V&LExZ{50*4o+o_ zrNKNVJK4e2t)slu8}ANRnu}RlCXoDSZq{~NS&e-_Pd}o_?j?L&ldjX{Frzb0##hLy z_fIZu%1diHbJE;JNL^t7tpt55U~_-joP^=x$b-B=DwIUupMrL6MS@8 z{3q}v4U>}^0CJD>_?+NnK)Upz-+&II)X(P!>L>tpwCjR<6K#bdl#k`3j#8vCk_;1! zAruA_@yYaJW}swVFEAz!dj?gk?}IQfCx3*Nby-7m@)uW8l1m2E2OW|ZrM@QD%k=u1 z9s#PQtgBWyC`SiTATH&*Zw;-%5sAmp9J7|sf`Mg8)2<4YSzzQT9FIr`f`)Cme zEad^e9csk<($fvhR^F^1h#~-v0~PTA^y=yU!G`MLANmE}bOY6-d+O|U=@6C;^bdg+ zejM8?c_?l0G@!;B6GA;~i#Ih$euy4=>KwOT5hqH2s0frwizv}YMBs=Ree{=7se%Bi zPr-rN>L8#1nwM-JW>H4LITRD;&xCq?Bc)8QxHf92bF3LA-|nKGOhMg`iMs!0<1mXQ zE$S%kW48JjdW4TYfFl-EyDmH#Ci-ZEY`ibi`(p$JwLZ5%%Lp`I-h0GF0Q_yBYS}w| z+2bCr_4%6DsQ}9ct=#~9BS#;7a*B$qOuwpO$2u5UItXrt;GB=y*Fl7)uhb_UMIs@V zm)_)2uIg!|sF(ihh70Mzt&i!FDCR z1&bAhVph=VU6#9wGb73V54!K7DPiO7s$^Cc6wpC=0W1UrDBy&02m4D})KN|WaPTQv zON;POZ&!d=OJATTRpFWy&I^JEzLu3UzM$9+ME$# z1Cb*85R5_G(?4V@-wn*s;(&>dveIs!&Oe9T50L}EGoK54+3B^%0oTcSm9G)@W?pKs zd&OfTjiXt!e6HaR5N#~RZu4bTYY0baOzZy=zI+@cyIFdS?ZpoN#e3afJBU98F~xr- zYmNlK$yxP}Qjd}79KrN8F#PRDLPN7T_*A~1g@dD*zx{t8ZhU7abL-@;)TJ)&#;OZ- z6+S3V@)%7|8ufA6Zq}X{&W;tU4YjJ49UT+R)D@>1dkts?`Fc+SkOQ&z%}r~Nmx>u< zQ5WSQU5LOy)Vu*-}v_Xyv19SAdao! 
zdqI>s?qvtJzJB)VBX=i;8)mmD%{3%98vM#aMPD(lGirZ5s|6So8=pB@fvhg>aOS2|7Rq%$RMaK zFsdMq%N10wVBjkg1J*~I+w%i2b78?K-JWtF04s-lgeR`0pzJp^{Ei!I1+9v3JZg#-bDU6=#`|=I8q)zc z0pQm+V{#M>Ldu_`)X~bTv0B;l;~V<0rc+&ICX}BrCZ*gR4GL}(N~IGgkt`OoD70ww zT47Kv$)o-+91kj@Jq^5Bl=tq>rP}X_T8<+e3R!p4DA3#mDk_TD)MvJOBx%8eVs}um zf3|up8YZ0NE^+jX)XzjwT1q4)(hZm`S}Upiy_{LpRnR1<@!d^J&O3cMz@Y`PFscj_ z1Qz<}?hJ+rF1lxL2TahU37Y%)h7(PRAX|sBTPPP8&{4dJYsw&e2SM97Ryb>DKyV4< z#v(({$jbL#Q8H8`5YA5%y)*RDhXE74-{_+@aIO@6v?Ut<&I%0^b@5HySikZ>AQe26 zVPHOjX4Oznqd?;yO4MlCHhFD#@u0#@A{hPN(A%e0I7N^DOojq8W4Dxjv0XuArpe2i zRdkTr=gGik9NhX*iZgdT+Cs2AMftBoWT^f_IVk=PoVvfS0ROdamRbAHujc~Q<}j=; zCgb~!yC4z_MMVsH!aP7c@BK&#dajFZWSFp`Tmj%FgDAvq1(JnDeMAx6`Bmw&;Hx)E z@+45p59PgMu|RnbAowu=fN;DFEk|m^tfl1W4iqtIn6RSQACv-3Ttl;YDCHVBLXR=! zCOATiR?4Fi%QbY15t7}OXf_U6VzQQ^nL9LwuVK>r-6#|>;vQN^|5KHe*I~g0aRD%= z9@pbita>8Sf=9i^ARmd_um`#V9o+4X>rij9@Bb&BH*qEzl6?E*xbq#gh4ob3bow%M z`Q0B>kr|GFaX>lb{A@1-U{aH(^|(ZROBw_>ZrtFuo*9QZSAxwISj>?YgIP^+2bw+n z;O0i6pO4SvxG9XbW{5+A0*9^~OA~WpNEGV8%+QS;zKc@-9bw7(=pi?^9Nzr=Je$z| ze$Ne3Qi~A`v$y{<*ZY&cVAVHlH$Zu@v{LF$;4sR~vwQcj%eowBZm8qGhxIsfKjEfH zFa{-~_#;@X&k3XCkDUG^D~HYdCzeHigTGJSl&xa*7jwn6+l;>MWZwm^O zL7~bX{0i97B0V8uV5@6WNX2#AYqz)Y`qPFB4R5o2XBzO7USU;ie$W4=1B+s>AhUSr zkV>m#m1aB=r|TSV0YdRmqX2*+{yQj}>!a~E6j7y(`=BMx06@{Hb9uQ9T0q@XlC==R z-8WET`^T;Mn@ly*tgsvC9d+ME%io3{imbPbe)~Ka{fB5Y8YtETTor%j=U)Uly452+b1 zDY=}aYKTU(&{IPh85%QIew0Ft`I%-GX0{RHi%WRMxUfc;XbAQ&J$bvf!69;u|E-?A ze@VJr0>6dFWnC;ONEtd=&PsXt6EvHDa;JFK`TSQi^7{+D%0k?LnQ8_<+rAi%C&yX) z?8n18&ODVNw?~S9WXYl2B3CO0h5BHF&>)}5?w47d!;>8I zMf^=`Yu0#Nc$Uiev6m-^k&8RC-x%M!x-9cuk8wUK|FFoI-wH>w0;jpJlY;cBg zRUX@PZ>$lYol=+PE^H9XPUds|7Ix95O*vh(w&(k+{_zja=rYN!5>P3a%QZD~^@YCMq<70jD8&w zyT#KX(|*&7iWQZW=|ih{=`#T!3Ba49d$vw4yTqn;I;Xz?>V*I|$J%tx4dCnb)yw0B7Pd3bqVog}xT*|O-^szyzf?{V! 
z--#!eWzz62GB*dr8;?s6A_KfRFgjnS!aNb+%lU%3{yg!B^^_afjpk!lo_1hB!JiJL;)Vbteg0%o z3cOBFUT5xz25`0ap%ffaOrzDbGBy9RXof&BS6kF6J4?{wRTocaf>vKh#=zo_?m|lA z%74_g@mfXrAB}$$62@ZD>)4Jqnvddl;70kV7%qsJg_h>kp;F8H{ruQy9CLQ$fCH=R<;qdEIikdeA%wz2OsG399~{s6$=P$|06)e zLXXbhpb5x7$JhSNIr=}o1Zt3G>Kto5GiTy<9nHPQ?1n0)PjMYKIT}?-A6_u4S2r9m z15d%H*G^@vw*27Ah+xT-?WuU`e4^15~+ytj}ef$nwicP;EMpA}9O1eZB0iS`-|)4(@;g73@y(LAkD< zRyA^ed`}(5T0`bm={dPHhy*28sG_s~A!dxPf_g*z_k}N(7yNKsugf7&dI4%X$}Q(* zoOyVFiW~K-e6!59uE+V{F9Npy{>`}|6Nx0|G7B)-5{0F zzgDIl^f!2pHkeomSagk^WVJ2}9MP0{VSKKoCI7~#n+Y5zDGGzvpfC#Ts+8UN>=3rVILD>j+%du7Pb%WEAn zi-m)W>dJ4J%EsCE=)p1S`5IH(jbPgW>DA>4a|(V*;u8Gb{hr6pr&WUwp3rOpo5$6+ z6@K(9og_W5BnK2`*H+hz@Z(Ff3SR=C$iuL z2H5C9$+NXFTdN87s)0H78s&DxqlGRC=o4JhOYA+6mR*xZ7f!wTJlod&|a3-VpR-Rodt2X(w#DPdL` zUK?A>qNDr8upaZ#jB5F$DU^hOk5AAuFl>!GzuAjxrBq=9VaM~HYJy0%F1H4-hA zzLj|-pK`dux;F>ZzAM+<#cuhx%sN^A3k)#g&-ZkXsMfw`Uf5gp5>Sh&=mTh~$Gzrk zZ%=)2Q7JWQAhgVOC0W?L!M^~=mKQWiaxcq@g(968Ilqb-J@=}O%nYByc>@F-`L#v zMMl9i^JNaL|M=@ajqv3Xe))eEhV9Z^KISS&D(|m+WYL}2J;^NTbfRxp(lNL2 z@G`c}T|{udubxe}Y|qvi{I{4{Xogj8ZVZ(M4mzr9{v(Y0A6fY&h0?ED`wK!ub_1kD z9@F*-fl0{zaxQiUqMcr&_^)5T;t)|%pDe5bKF4Wn_o{{}_cRZ|@U2NQ-Fx<6UA@#T z#aHX6KM5%Bj;+;O1>1>1tplq@O^G$ad<%KAHguN329*C2MCW}83Iv*6tiMN8?b-G1 ze~b`EMA{D6_XH-{4zDNqYVBV`YH99?FN4yq1UdG6{J%@h&Wrxeg(WJ@*3#s3a@Y zs-OQ;Nf!3>%-zl8RZZ5u@ismwDe2ktHb?Ctb=TVJ3D^gk?0x#l4m*Rw7t1*`a7Ms8 ze{4sBJ-t}qhhzU7&{v?f{pD~rhhQ4C>!65=G5<@x>i?}h{eOVSI1cMVZ(8KTSsZUd zcZ7JEwL$bzN5)cX2jB6UFuXxFFj>;ebkS|_I|wQ^Y?0mMNJ}2{2NwD5pYS?Jsd=BD zjDlLa|6|^w4hw_~T%sU~0pq%4ybnY!6Ty?FqI_q@|^01W{h2okD%QC4-jyXm=5O zeFC6*_=7-6uyYW6W8$DI%o9}l^Dk2KIyn3o9tQ%2SHhFM;k37kgZw~knIYAX2`wdRYv?MF5`#Ve|GE&cak1cbu3Z+kd^4Hd$!_Rd>E59v4t zeo9*9kgL7Dtbh|N%jneWcgGRNE4jRF4hTwove(jCnj6(KU4fJ*CO%4ZCtC5g1T%eP z(_k_Ey>s>6WK>`B&RwR{uJU!*xnLcrrB4(ni~P(2KEgp_aJqrbJZhOBY_?yWD$stu zJ?)nbL)IH-p7Vp#EUyIhW=CL4au4btUinxf9GP0WLxcCn|~7*P2)khVA6IbzPc*aqy3G~?Xu zbgDH|K3RI}RuwG`*zhjDoSNo^^%7tM*xN_gP}S7DIYST_62hK+HT2N5C+6ERuEc>z 
z1^Cnu_+EwCK*2ir9Ex(ba)XARgz9L|X@EBD)@@3PBZ(tn@sA5eVE`Wz63%s+&UY~$ zNMGrg1U2I5}+S)x{Z6&0kUo>RMU%uARIRbXZgZ&|(fLh39VXVc>{zEWTr33QIcZr!%J{CsC zmw3fi7s;pcorQ*q`A~`%(0p61>%_GF{*n&s!@}MTb$xw(AYTSzMJ!%XNBB$5Z{PBH zua@jT;^1K9U5IPC2R>*3w)~t~B^OrYYsN7+7reH(Bu6FCq~~A%Qf(Q9bL}-?U8XtI z>`}0FPcF0JEO#P$9azu>i*r?WUie_+d=jX$@bGms z(U3|1Y?Z5|gG0|n(MPZ+sxPKg!7I_8!gskx_XZi6=SE8`AB=rN*>su6>Z#4FYzQggNrf}ei1q%<<@xznl`3%X=@ej12Kayr z+6%T!oDtJFt{+l^PnaMkpV-W1RFiyH3aGfi3CUBae+Bm24en}!4J5s*k%nyv{$P^} zK!REz*0cL7EHJrvdEo|Ijyi2) zQ*!T}etvK5oM(;Tqdw~Akr<6QsI2$uW=C;2z!##^A{_(8UsKr9&G%@YoMa_|+lLnar+zY)j+kxzlBx*C5rT~5!e6Y bOBb}w9_0f@gzulAUh`5y?pcAj-uwRr+*T%z literal 0 HcmV?d00001 diff --git a/qwen3_moe_mast_20steps_loss_curve.svg b/qwen3_moe_mast_20steps_loss_curve.svg new file mode 100644 index 00000000..7fc6c0ca --- /dev/null +++ b/qwen3_moe_mast_20steps_loss_curve.svg @@ -0,0 +1,68 @@ + + + +Qwen3 MoE 30B-A3B MAST Training Loss + +9.5 + +10.0 + +10.5 + +11.0 + +11.5 + +12.0 + +12.5 + +1 + +2 + +4 + +6 + +8 + +10 + +12 + +14 + +16 + +18 + +20 + + +Training step +Loss + +step 1: 12.37845 +step 2: 12.36325 +step 3: 12.33137 +step 4: 12.28397 +step 5: 12.22048 +step 6: 12.14017 +step 7: 12.04897 +step 8: 11.94193 +step 9: 11.81908 +step 10: 11.68259 +step 11: 11.53297 +step 12: 11.37303 +step 13: 11.19815 +step 14: 11.02700 +step 15: 10.81583 +step 16: 10.61479 +step 17: 10.38304 +step 18: 10.15753 +step 19: 9.92291 +step 20: 9.66127 +12.37845 +9.66127 + \ No newline at end of file diff --git a/qwen3_moe_mast_20steps_losses.csv b/qwen3_moe_mast_20steps_losses.csv new file mode 100644 index 00000000..cf58cdd8 --- /dev/null +++ b/qwen3_moe_mast_20steps_losses.csv @@ -0,0 +1,21 @@ +step,loss +1,12.37845 +2,12.36325 +3,12.33137 +4,12.28397 +5,12.22048 +6,12.14017 +7,12.04897 +8,11.94193 +9,11.81908 +10,11.68259 +11,11.53297 +12,11.37303 +13,11.19815 +14,11.02700 +15,10.81583 +16,10.61479 +17,10.38304 +18,10.15753 +19,9.92291 +20,9.66127 From 2f4f102e60accd248dde4ca8c0a2557746ba7a0f Mon Sep 17 00:00:00 
2001 From: Kaijian Wang Date: Thu, 28 May 2026 12:44:47 -0700 Subject: [PATCH 03/27] Add LP relaxation support for sharding optimizer Adds LP-relaxation lower-bound plumbing and initial DP topology construction coverage, while removing generated profile/log artifacts from tracking and ignoring future outputs.\n\nAuthored with Claude. --- .gitignore | 2 + autoparallel/optimize_sharding.py | 207 +- autoparallel/serialization.py | 1 + ...ama3_3b_ilp_node_indegree_distribution.svg | 51 - .../llama3_8b_4x4_strategy_full.json | 287470 --------------- .../llama3_8b_4x4_strategy_summary.json | 2054 - .../real_llama3_3b_dag_node_stats.csv | 7200 - .../real_llama3_3b_dag_summary.json | 883 - .../real_llama3_3b_merge_points.csv | 1668 - profile_results/real_llama3_by_mesh_dim.svg | 167 - profile_results/real_llama3_by_model_size.svg | 177 - profile_results/real_llama3_dag_analysis.py | 255 - .../real_llama3_optimizer_presolve_3d4d.log | 7 - .../real_llama3_optimizer_sweep.csv | 9 - .../real_llama3_optimizer_sweep.jsonl | 8 - .../real_llama3_optimizer_sweep.log | 54 - .../real_llama3_optimizer_sweep.py | 351 - .../real_llama3_partial_presolve.csv | 3 - profile_results/real_llama3_timeouts.csv | 3 - qwen3_8b_autoparallel_30steps.log | 1 - tests/test_dp_solver.py | 158 + tests/test_lp_relaxation.py | 103 + 22 files changed, 467 insertions(+), 300365 deletions(-) delete mode 100644 profile_results/llama3_3b_ilp_node_indegree_distribution.svg delete mode 100644 profile_results/llama3_8b_4x4_strategy_full.json delete mode 100644 profile_results/llama3_8b_4x4_strategy_summary.json delete mode 100644 profile_results/real_llama3_3b_dag_node_stats.csv delete mode 100644 profile_results/real_llama3_3b_dag_summary.json delete mode 100644 profile_results/real_llama3_3b_merge_points.csv delete mode 100644 profile_results/real_llama3_by_mesh_dim.svg delete mode 100644 profile_results/real_llama3_by_model_size.svg delete mode 100644 profile_results/real_llama3_dag_analysis.py delete mode 
100644 profile_results/real_llama3_optimizer_presolve_3d4d.log delete mode 100644 profile_results/real_llama3_optimizer_sweep.csv delete mode 100644 profile_results/real_llama3_optimizer_sweep.jsonl delete mode 100644 profile_results/real_llama3_optimizer_sweep.log delete mode 100644 profile_results/real_llama3_optimizer_sweep.py delete mode 100644 profile_results/real_llama3_partial_presolve.csv delete mode 100644 profile_results/real_llama3_timeouts.csv delete mode 120000 qwen3_8b_autoparallel_30steps.log create mode 100644 tests/test_dp_solver.py create mode 100644 tests/test_lp_relaxation.py diff --git a/.gitignore b/.gitignore index bcaae24d..ff4f7532 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ *.pyc *.pyo *.so +*.log .mypy_cache/ *.egg-info/ @@ -12,5 +13,6 @@ build/ dist/ tmp/ out/ +profile_results/ .vscode/ diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 2b1909ee..06f2a4e6 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -203,6 +203,62 @@ class DecisionVar: input_spec: Any # DTensorSpec +@dataclass +class LPRelaxationResult: + objective: float + status: str + solve_s: float + total_s: float + + +@dataclass +class DPTopology: + nodes: list[torch.fx.Node] + predecessors: dict[torch.fx.Node, list[torch.fx.Node]] + node_to_index: dict[torch.fx.Node, int] + + +class DPBasedShardingSolver: + def __init__(self, optimizer): + self.optimizer = optimizer + self.topology: Optional[DPTopology] = None + + def build_topological_order(self): + nodes = [node for node in self.optimizer.nodes if node.op != "output"] + node_to_index = {node: i for i, node in enumerate(nodes)} + predecessors = {} + + for node in nodes: + node_predecessors = self.optimizer._all_input_nodes(node) + predecessors[node] = node_predecessors + node_index = node_to_index[node] + for pred in node_predecessors: + pred_index = node_to_index.get(pred) + if pred_index is None: + raise RuntimeError( + f"Predecessor 
{pred} for node {node} is missing from " + "the DP topology" + ) + if pred_index >= node_index: + raise RuntimeError( + f"Predecessor {pred} for node {node} does not appear " + "before it in topological order" + ) + + self.topology = DPTopology( + nodes=nodes, + predecessors=predecessors, + node_to_index=node_to_index, + ) + return self.topology + + def get_solution(self, verbose=False): + raise NotImplementedError( + "DP-based sharding solver only builds topological order today; " + "strategy selection is not implemented yet." + ) + + def _assert_has_tensor_meta(spec_or_specs, node, label): """Assert that all DTensorSpecs in a spec (possibly a tuple) have tensor_meta.""" if isinstance(spec_or_specs, (list, tuple)): @@ -224,8 +280,15 @@ def __init__( mesh, force_grad_reduce_in_higher_precision=False, repeated_subgraphs=False, + solver_backend="ilp", ): self.orig_gm = gm + if solver_backend not in {"ilp", "dp"}: + raise ValueError( + f"Unsupported solver_backend={solver_backend!r}; " + "expected 'ilp' or 'dp'" + ) + self.solver_backend = solver_backend # The optimizer works on a concretized copy of the graph where all # symbolic shapes are replaced with their concrete hint values. 
This # centralizes dynamic-shape handling: the optimization pipeline @@ -276,6 +339,37 @@ def __init__( get_placement_options_timer().report() self.cluster_links: dict[tuple, tuple] = {} + if self.solver_backend == "dp": + t0 = time.perf_counter() + self.solver = DPBasedShardingSolver(self) + topology = self.solver.build_topological_order() + t1 = time.perf_counter() + self.profile["dp"] = { + "topology_nodes": len(topology.nodes), + "topology_edges": sum( + len(preds) for preds in topology.predecessors.values() + ), + } + self.profile["timings"].update( + { + "topology_construction_s": t1 - t0, + "init_total_s": t1 - t_init_start, + } + ) + logger.info( + "ShardingOptimizer phase profile: phase=dp_topology " + "mesh_shape=%s mesh_dim_names=%s mesh_size=%s model_params=%s " + "topology_nodes=%s topology_edges=%s elapsed=%.3fs", + self.profile["mesh"]["shape"], + self.profile["mesh"]["dim_names"], + self.profile["mesh"]["size"], + self._format_billions(self.profile["model"]["parameter_numel"]), + self.profile["dp"]["topology_nodes"], + self.profile["dp"]["topology_edges"], + t1 - t0, + ) + return + if repeated_subgraphs: t = time.time() clusters = get_identical_regions(self.gm.graph, self.strats) @@ -641,14 +735,19 @@ def walk_over_options(self, node, constrain_arg=None): for inp_idx in range(len(strategy.redistribute_cost[argi])): yield argi, out_idx, inp_idx - def _create_pulp_variables(self): - """Create PuLP binary variables for all decision points, resolving - cluster links so that identical nodes share the same variable. + def _create_pulp_variables(self, variable_category=pulp.LpBinary): + """Create PuLP variables for all decision points, resolving cluster + links so that identical nodes share the same variable. Returns a dict mapping root (node_idx, argi, out_idx, inp_idx) keys to their PuLP variables. Linked keys are not stored; use _get_pulp_variable() to resolve them through cluster_links. 
""" + if variable_category not in {pulp.LpBinary, pulp.LpContinuous}: + raise ValueError( + f"Unsupported variable_category={variable_category!r}; " + "expected pulp.LpBinary or pulp.LpContinuous" + ) cluster_linked_node_idxs = {key[0] for key in self.cluster_links} pulp_variables = {} @@ -661,10 +760,16 @@ def _create_pulp_variables(self): for argi, out_idx, inp_idx in self.walk_over_options(node): key = (node_idx, argi, out_idx, inp_idx) root_node = self.nodes[node_idx] + bounds = ( + {"lowBound": 0, "upBound": 1} + if variable_category == pulp.LpContinuous + else {} + ) pulp_variables[key] = pulp.LpVariable( f"n={root_node},s={node_idx},arg={argi}," f"output_p={out_idx},input_p={inp_idx}", - cat=pulp.LpBinary, + cat=variable_category, + **bounds, ) return pulp_variables @@ -1133,6 +1238,97 @@ def _set_objective(self): terms.append(dv.var * dv.cost * multiplier) self.prob += pulp.lpSum(terms) + def get_lower_bound(self, verbose=False): + """Solve the LP relaxation and return a lower bound on the ILP objective. + + This relaxes the existing binary PuLP variables to continuous variables + in [0, 1], solves the current problem with all constraints already added, + then restores the optimizer state. The result is a certificate only: + fractional LP values are not valid sharding placements. 
+ """ + if self.solver_backend == "dp": + raise NotImplementedError( + "LP relaxation is only available for the PuLP-backed optimizer" + ) + + t0 = time.perf_counter() + old_objective = self.prob.objective + old_status = self.prob.status + old_sol_status = getattr(self.prob, "sol_status", None) + old_selected_keys_marker = object() + old_selected_keys = getattr(self, "selected_keys", old_selected_keys_marker) + var_states = { + var: (var.cat, var.lowBound, var.upBound, var.varValue) + for var in self.pulp_variables.values() + } + + try: + if self.prob.objective is None: + self._set_objective() + + for var in self.pulp_variables.values(): + var.cat = pulp.LpContinuous + var.lowBound = 0 + var.upBound = 1 + var.varValue = None + + solver = pulp.PULP_CBC_CMD(msg=verbose) + t_solve0 = time.perf_counter() + with tempfile.TemporaryDirectory() as tmpdir: + solver.tmpDir = tmpdir + self.prob.solve(solver) + solve_s = time.perf_counter() - t_solve0 + + status = pulp.LpStatus.get(self.prob.status, self.prob.status) + objective = self._safe_float(pulp.value(self.prob.objective)) + result = LPRelaxationResult( + objective=objective, + status=status, + solve_s=solve_s, + total_s=time.perf_counter() - t0, + ) + self.profile["last_lp_relaxation"] = { + "objective": result.objective, + "status": result.status, + "solve_s": result.solve_s, + "total_s": result.total_s, + } + logger.info( + "ShardingOptimizer LP relaxation profile: " + "mesh_shape=%s mesh_dim_names=%s mesh_size=%s model_params=%s " + "unique_ilp_vars=%s constraints=%s status=%s objective=%.4f " + "timings={solve=%.3fs,total=%.3fs}", + self.profile["mesh"]["shape"], + self.profile["mesh"]["dim_names"], + self.profile["mesh"]["size"], + self._format_billions(self.profile["model"]["parameter_numel"]), + len(self.pulp_variables), + len(self.prob.constraints), + result.status, + result.objective, + result.solve_s, + result.total_s, + ) + return result + finally: + for var, (cat, low_bound, up_bound, value) in 
var_states.items(): + var.cat = cat + var.lowBound = low_bound + var.upBound = up_bound + var.varValue = value + self.prob.objective = old_objective + self.prob.status = old_status + if old_sol_status is None: + if hasattr(self.prob, "sol_status"): + delattr(self.prob, "sol_status") + else: + self.prob.sol_status = old_sol_status + if old_selected_keys is old_selected_keys_marker: + if hasattr(self, "selected_keys"): + delattr(self, "selected_keys") + else: + self.selected_keys = old_selected_keys + def _solve(self, verbose=False): solver = pulp.PULP_CBC_CMD(msg=verbose) # Use a dedicated temp directory for PuLP's intermediate files (.mps, @@ -1257,6 +1453,9 @@ def _to_concrete_solution(self, solution): return {self._orig_to_concrete[node]: spec for node, spec in solution.items()} def get_solution(self, verbose=False): + if self.solver_backend == "dp": + return self.solver.get_solution(verbose=verbose) + t0 = time.perf_counter() t_objective0 = time.perf_counter() self._set_objective() diff --git a/autoparallel/serialization.py b/autoparallel/serialization.py index 1b31bab8..0dde5b69 100644 --- a/autoparallel/serialization.py +++ b/autoparallel/serialization.py @@ -257,6 +257,7 @@ def load_optimizer(cls, path): opt.strats = strats opt.nodes = list(strats.keys()) opt.node_map = {node: i for i, node in enumerate(opt.nodes)} + opt.solver_backend = "ilp" opt.force_grad_reduce_in_higher_precision = save_dict[ "force_grad_reduce_in_higher_precision" ] diff --git a/profile_results/llama3_3b_ilp_node_indegree_distribution.svg b/profile_results/llama3_3b_ilp_node_indegree_distribution.svg deleted file mode 100644 index d722fd85..00000000 --- a/profile_results/llama3_3b_ilp_node_indegree_distribution.svg +++ /dev/null @@ -1,51 +0,0 @@ - - - -AutoParallel ILP Node In-Degree Distribution -LLaMA3 3B, mesh=(64,), repeated_subgraphs=True; raw optimizer DAG, no manual cluster collapse -Nodes excluding output: 7199; unique direct dependency edges: 8805 - -1 - -10 - -100 - -1000 - 
-10000 - - -direct dependency nodes / in-degree -node count, log scale - -257 -3.57% -0 - -5275 -73.27% -1 - -1611 -22.38% -2 - -28 -0.39% -3 - -28 -0.39% -8 -Histogram: 0->257, 1->5275, 2->1611, 3->28, 8->28 - \ No newline at end of file diff --git a/profile_results/llama3_8b_4x4_strategy_full.json b/profile_results/llama3_8b_4x4_strategy_full.json deleted file mode 100644 index 88f58ae3..00000000 --- a/profile_results/llama3_8b_4x4_strategy_full.json +++ /dev/null @@ -1,287470 +0,0 @@ -{ - "mesh": { - "dim_names": [ - "dp", - "tp" - ], - "shape": [ - 4, - 4 - ] - }, - "nodes": [ - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "tok_embeddings.weight", - "name": "primals_1", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(1)S(1)", - "shape": [ - 128256, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.0.attention.wq.weight", - "name": "primals_2", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.0.attention.wk.weight", - "name": "primals_3", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.0.attention.wv.weight", - "name": "primals_4", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.0.attention.wo.weight", - "name": "primals_5", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": 
"param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.0.feed_forward.w1.weight", - "name": "primals_6", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.0.feed_forward.w2.weight", - "name": "primals_7", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.0.feed_forward.w3.weight", - "name": "primals_8", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.0.attention_norm.weight", - "name": "primals_9", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.0.ffn_norm.weight", - "name": "primals_10", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.1.attention.wq.weight", - "name": "primals_11", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": 
"layers.1.attention.wk.weight", - "name": "primals_12", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.1.attention.wv.weight", - "name": "primals_13", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.1.attention.wo.weight", - "name": "primals_14", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.1.feed_forward.w1.weight", - "name": "primals_15", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.1.feed_forward.w2.weight", - "name": "primals_16", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.1.feed_forward.w3.weight", - "name": "primals_17", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.1.attention_norm.weight", - "name": "primals_18", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - 
"shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.1.ffn_norm.weight", - "name": "primals_19", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.2.attention.wq.weight", - "name": "primals_20", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.2.attention.wk.weight", - "name": "primals_21", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.2.attention.wv.weight", - "name": "primals_22", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.2.attention.wo.weight", - "name": "primals_23", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.2.feed_forward.w1.weight", - "name": "primals_24", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.2.feed_forward.w2.weight", - "name": 
"primals_25", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.2.feed_forward.w3.weight", - "name": "primals_26", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.2.attention_norm.weight", - "name": "primals_27", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.2.ffn_norm.weight", - "name": "primals_28", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.3.attention.wq.weight", - "name": "primals_29", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.3.attention.wk.weight", - "name": "primals_30", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.3.attention.wv.weight", - "name": "primals_31", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - 
"compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.3.attention.wo.weight", - "name": "primals_32", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.3.feed_forward.w1.weight", - "name": "primals_33", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.3.feed_forward.w2.weight", - "name": "primals_34", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.3.feed_forward.w3.weight", - "name": "primals_35", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.3.attention_norm.weight", - "name": "primals_36", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.3.ffn_norm.weight", - "name": "primals_37", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.4.attention.wq.weight", - "name": "primals_38", - "op": "placeholder", - "phase": "forward", - 
"placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.4.attention.wk.weight", - "name": "primals_39", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.4.attention.wv.weight", - "name": "primals_40", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.4.attention.wo.weight", - "name": "primals_41", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.4.feed_forward.w1.weight", - "name": "primals_42", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.4.feed_forward.w2.weight", - "name": "primals_43", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.4.feed_forward.w3.weight", - "name": "primals_44", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": 
"float32", - "inputs": [], - "module_path": "layers.4.attention_norm.weight", - "name": "primals_45", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.4.ffn_norm.weight", - "name": "primals_46", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.5.attention.wq.weight", - "name": "primals_47", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.5.attention.wk.weight", - "name": "primals_48", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.5.attention.wv.weight", - "name": "primals_49", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.5.attention.wo.weight", - "name": "primals_50", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.5.feed_forward.w1.weight", - "name": "primals_51", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": 
"S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.5.feed_forward.w2.weight", - "name": "primals_52", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.5.feed_forward.w3.weight", - "name": "primals_53", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.5.attention_norm.weight", - "name": "primals_54", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.5.ffn_norm.weight", - "name": "primals_55", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.6.attention.wq.weight", - "name": "primals_56", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.6.attention.wk.weight", - "name": "primals_57", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": 
"layers.6.attention.wv.weight", - "name": "primals_58", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.6.attention.wo.weight", - "name": "primals_59", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.6.feed_forward.w1.weight", - "name": "primals_60", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.6.feed_forward.w2.weight", - "name": "primals_61", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.6.feed_forward.w3.weight", - "name": "primals_62", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.6.attention_norm.weight", - "name": "primals_63", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.6.ffn_norm.weight", - "name": "primals_64", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 
4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.7.attention.wq.weight", - "name": "primals_65", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.7.attention.wk.weight", - "name": "primals_66", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.7.attention.wv.weight", - "name": "primals_67", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.7.attention.wo.weight", - "name": "primals_68", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.7.feed_forward.w1.weight", - "name": "primals_69", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.7.feed_forward.w2.weight", - "name": "primals_70", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.7.feed_forward.w3.weight", - "name": 
"primals_71", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.7.attention_norm.weight", - "name": "primals_72", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.7.ffn_norm.weight", - "name": "primals_73", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.8.attention.wq.weight", - "name": "primals_74", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.8.attention.wk.weight", - "name": "primals_75", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.8.attention.wv.weight", - "name": "primals_76", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.8.attention.wo.weight", - "name": "primals_77", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - 
"compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.8.feed_forward.w1.weight", - "name": "primals_78", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.8.feed_forward.w2.weight", - "name": "primals_79", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.8.feed_forward.w3.weight", - "name": "primals_80", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.8.attention_norm.weight", - "name": "primals_81", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.8.ffn_norm.weight", - "name": "primals_82", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.9.attention.wq.weight", - "name": "primals_83", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.9.attention.wk.weight", - "name": "primals_84", - "op": "placeholder", - "phase": "forward", - 
"placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.9.attention.wv.weight", - "name": "primals_85", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.9.attention.wo.weight", - "name": "primals_86", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.9.feed_forward.w1.weight", - "name": "primals_87", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.9.feed_forward.w2.weight", - "name": "primals_88", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.9.feed_forward.w3.weight", - "name": "primals_89", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.9.attention_norm.weight", - "name": "primals_90", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - 
"inputs": [], - "module_path": "layers.9.ffn_norm.weight", - "name": "primals_91", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.10.attention.wq.weight", - "name": "primals_92", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.10.attention.wk.weight", - "name": "primals_93", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.10.attention.wv.weight", - "name": "primals_94", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.10.attention.wo.weight", - "name": "primals_95", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.10.feed_forward.w1.weight", - "name": "primals_96", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.10.feed_forward.w2.weight", - "name": "primals_97", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - 
"placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.10.feed_forward.w3.weight", - "name": "primals_98", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.10.attention_norm.weight", - "name": "primals_99", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.10.ffn_norm.weight", - "name": "primals_100", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.11.attention.wq.weight", - "name": "primals_101", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.11.attention.wk.weight", - "name": "primals_102", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.11.attention.wv.weight", - "name": "primals_103", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": 
"layers.11.attention.wo.weight", - "name": "primals_104", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.11.feed_forward.w1.weight", - "name": "primals_105", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.11.feed_forward.w2.weight", - "name": "primals_106", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.11.feed_forward.w3.weight", - "name": "primals_107", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.11.attention_norm.weight", - "name": "primals_108", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.11.ffn_norm.weight", - "name": "primals_109", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.12.attention.wq.weight", - "name": "primals_110", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - 
"shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.12.attention.wk.weight", - "name": "primals_111", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.12.attention.wv.weight", - "name": "primals_112", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.12.attention.wo.weight", - "name": "primals_113", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.12.feed_forward.w1.weight", - "name": "primals_114", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.12.feed_forward.w2.weight", - "name": "primals_115", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.12.feed_forward.w3.weight", - "name": "primals_116", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": 
"layers.12.attention_norm.weight", - "name": "primals_117", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.12.ffn_norm.weight", - "name": "primals_118", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.13.attention.wq.weight", - "name": "primals_119", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.13.attention.wk.weight", - "name": "primals_120", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.13.attention.wv.weight", - "name": "primals_121", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.13.attention.wo.weight", - "name": "primals_122", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.13.feed_forward.w1.weight", - "name": "primals_123", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 
14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.13.feed_forward.w2.weight", - "name": "primals_124", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.13.feed_forward.w3.weight", - "name": "primals_125", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.13.attention_norm.weight", - "name": "primals_126", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.13.ffn_norm.weight", - "name": "primals_127", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.14.attention.wq.weight", - "name": "primals_128", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.14.attention.wk.weight", - "name": "primals_129", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.14.attention.wv.weight", - "name": 
"primals_130", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.14.attention.wo.weight", - "name": "primals_131", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.14.feed_forward.w1.weight", - "name": "primals_132", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.14.feed_forward.w2.weight", - "name": "primals_133", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.14.feed_forward.w3.weight", - "name": "primals_134", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.14.attention_norm.weight", - "name": "primals_135", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.14.ffn_norm.weight", - "name": "primals_136", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - 
"transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.15.attention.wq.weight", - "name": "primals_137", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.15.attention.wk.weight", - "name": "primals_138", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.15.attention.wv.weight", - "name": "primals_139", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.15.attention.wo.weight", - "name": "primals_140", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.15.feed_forward.w1.weight", - "name": "primals_141", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.15.feed_forward.w2.weight", - "name": "primals_142", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.15.feed_forward.w3.weight", - "name": 
"primals_143", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.15.attention_norm.weight", - "name": "primals_144", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.15.ffn_norm.weight", - "name": "primals_145", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.16.attention.wq.weight", - "name": "primals_146", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.16.attention.wk.weight", - "name": "primals_147", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.16.attention.wv.weight", - "name": "primals_148", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.16.attention.wo.weight", - "name": "primals_149", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, 
- { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.16.feed_forward.w1.weight", - "name": "primals_150", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.16.feed_forward.w2.weight", - "name": "primals_151", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.16.feed_forward.w3.weight", - "name": "primals_152", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.16.attention_norm.weight", - "name": "primals_153", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.16.ffn_norm.weight", - "name": "primals_154", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.17.attention.wq.weight", - "name": "primals_155", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.17.attention.wk.weight", - "name": "primals_156", - "op": "placeholder", - 
"phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.17.attention.wv.weight", - "name": "primals_157", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.17.attention.wo.weight", - "name": "primals_158", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.17.feed_forward.w1.weight", - "name": "primals_159", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.17.feed_forward.w2.weight", - "name": "primals_160", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.17.feed_forward.w3.weight", - "name": "primals_161", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.17.attention_norm.weight", - "name": "primals_162", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - 
"compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.17.ffn_norm.weight", - "name": "primals_163", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.18.attention.wq.weight", - "name": "primals_164", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.18.attention.wk.weight", - "name": "primals_165", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.18.attention.wv.weight", - "name": "primals_166", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.18.attention.wo.weight", - "name": "primals_167", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.18.feed_forward.w1.weight", - "name": "primals_168", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.18.feed_forward.w2.weight", - "name": "primals_169", - "op": "placeholder", - "phase": 
"forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.18.feed_forward.w3.weight", - "name": "primals_170", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.18.attention_norm.weight", - "name": "primals_171", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.18.ffn_norm.weight", - "name": "primals_172", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.19.attention.wq.weight", - "name": "primals_173", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.19.attention.wk.weight", - "name": "primals_174", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.19.attention.wv.weight", - "name": "primals_175", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": 
"float32", - "inputs": [], - "module_path": "layers.19.attention.wo.weight", - "name": "primals_176", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.19.feed_forward.w1.weight", - "name": "primals_177", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.19.feed_forward.w2.weight", - "name": "primals_178", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.19.feed_forward.w3.weight", - "name": "primals_179", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.19.attention_norm.weight", - "name": "primals_180", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.19.ffn_norm.weight", - "name": "primals_181", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.20.attention.wq.weight", - "name": "primals_182", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": 
"param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.20.attention.wk.weight", - "name": "primals_183", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.20.attention.wv.weight", - "name": "primals_184", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.20.attention.wo.weight", - "name": "primals_185", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.20.feed_forward.w1.weight", - "name": "primals_186", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.20.feed_forward.w2.weight", - "name": "primals_187", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.20.feed_forward.w3.weight", - "name": "primals_188", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - 
"inputs": [], - "module_path": "layers.20.attention_norm.weight", - "name": "primals_189", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.20.ffn_norm.weight", - "name": "primals_190", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.21.attention.wq.weight", - "name": "primals_191", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.21.attention.wk.weight", - "name": "primals_192", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.21.attention.wv.weight", - "name": "primals_193", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.21.attention.wo.weight", - "name": "primals_194", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.21.feed_forward.w1.weight", - "name": "primals_195", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - 
"placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.21.feed_forward.w2.weight", - "name": "primals_196", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.21.feed_forward.w3.weight", - "name": "primals_197", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.21.attention_norm.weight", - "name": "primals_198", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.21.ffn_norm.weight", - "name": "primals_199", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.22.attention.wq.weight", - "name": "primals_200", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.22.attention.wk.weight", - "name": "primals_201", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": 
"layers.22.attention.wv.weight", - "name": "primals_202", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.22.attention.wo.weight", - "name": "primals_203", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.22.feed_forward.w1.weight", - "name": "primals_204", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.22.feed_forward.w2.weight", - "name": "primals_205", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.22.feed_forward.w3.weight", - "name": "primals_206", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.22.attention_norm.weight", - "name": "primals_207", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.22.ffn_norm.weight", - "name": "primals_208", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", 
- "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.23.attention.wq.weight", - "name": "primals_209", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.23.attention.wk.weight", - "name": "primals_210", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.23.attention.wv.weight", - "name": "primals_211", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.23.attention.wo.weight", - "name": "primals_212", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.23.feed_forward.w1.weight", - "name": "primals_213", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.23.feed_forward.w2.weight", - "name": "primals_214", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": 
"layers.23.feed_forward.w3.weight", - "name": "primals_215", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.23.attention_norm.weight", - "name": "primals_216", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.23.ffn_norm.weight", - "name": "primals_217", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.24.attention.wq.weight", - "name": "primals_218", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.24.attention.wk.weight", - "name": "primals_219", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.24.attention.wv.weight", - "name": "primals_220", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.24.attention.wo.weight", - "name": "primals_221", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 
4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.24.feed_forward.w1.weight", - "name": "primals_222", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.24.feed_forward.w2.weight", - "name": "primals_223", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.24.feed_forward.w3.weight", - "name": "primals_224", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.24.attention_norm.weight", - "name": "primals_225", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.24.ffn_norm.weight", - "name": "primals_226", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.25.attention.wq.weight", - "name": "primals_227", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.25.attention.wk.weight", - 
"name": "primals_228", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.25.attention.wv.weight", - "name": "primals_229", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.25.attention.wo.weight", - "name": "primals_230", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.25.feed_forward.w1.weight", - "name": "primals_231", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.25.feed_forward.w2.weight", - "name": "primals_232", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.25.feed_forward.w3.weight", - "name": "primals_233", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.25.attention_norm.weight", - "name": "primals_234", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - 
], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.25.ffn_norm.weight", - "name": "primals_235", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.26.attention.wq.weight", - "name": "primals_236", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.26.attention.wk.weight", - "name": "primals_237", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.26.attention.wv.weight", - "name": "primals_238", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.26.attention.wo.weight", - "name": "primals_239", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.26.feed_forward.w1.weight", - "name": "primals_240", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.26.feed_forward.w2.weight", - "name": 
"primals_241", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.26.feed_forward.w3.weight", - "name": "primals_242", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.26.attention_norm.weight", - "name": "primals_243", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.26.ffn_norm.weight", - "name": "primals_244", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.27.attention.wq.weight", - "name": "primals_245", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.27.attention.wk.weight", - "name": "primals_246", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.27.attention.wv.weight", - "name": "primals_247", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 
- }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.27.attention.wo.weight", - "name": "primals_248", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.27.feed_forward.w1.weight", - "name": "primals_249", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.27.feed_forward.w2.weight", - "name": "primals_250", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.27.feed_forward.w3.weight", - "name": "primals_251", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.27.attention_norm.weight", - "name": "primals_252", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.27.ffn_norm.weight", - "name": "primals_253", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.28.attention.wq.weight", - "name": "primals_254", - "op": "placeholder", - 
"phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.28.attention.wk.weight", - "name": "primals_255", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.28.attention.wv.weight", - "name": "primals_256", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.28.attention.wo.weight", - "name": "primals_257", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.28.feed_forward.w1.weight", - "name": "primals_258", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.28.feed_forward.w2.weight", - "name": "primals_259", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.28.feed_forward.w3.weight", - "name": "primals_260", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - 
"compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.28.attention_norm.weight", - "name": "primals_261", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.28.ffn_norm.weight", - "name": "primals_262", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.29.attention.wq.weight", - "name": "primals_263", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.29.attention.wk.weight", - "name": "primals_264", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.29.attention.wv.weight", - "name": "primals_265", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.29.attention.wo.weight", - "name": "primals_266", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.29.feed_forward.w1.weight", - "name": "primals_267", - "op": "placeholder", - "phase": "forward", 
- "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.29.feed_forward.w2.weight", - "name": "primals_268", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.29.feed_forward.w3.weight", - "name": "primals_269", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.29.attention_norm.weight", - "name": "primals_270", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.29.ffn_norm.weight", - "name": "primals_271", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.30.attention.wq.weight", - "name": "primals_272", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.30.attention.wk.weight", - "name": "primals_273", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - 
"inputs": [], - "module_path": "layers.30.attention.wv.weight", - "name": "primals_274", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.30.attention.wo.weight", - "name": "primals_275", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.30.feed_forward.w1.weight", - "name": "primals_276", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.30.feed_forward.w2.weight", - "name": "primals_277", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.30.feed_forward.w3.weight", - "name": "primals_278", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.30.attention_norm.weight", - "name": "primals_279", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.30.ffn_norm.weight", - "name": "primals_280", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": 
"param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.31.attention.wq.weight", - "name": "primals_281", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.31.attention.wk.weight", - "name": "primals_282", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.31.attention.wv.weight", - "name": "primals_283", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.31.attention.wo.weight", - "name": "primals_284", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.31.feed_forward.w1.weight", - "name": "primals_285", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.31.feed_forward.w2.weight", - "name": "primals_286", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], 
- "module_path": "layers.31.feed_forward.w3.weight", - "name": "primals_287", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.31.attention_norm.weight", - "name": "primals_288", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "layers.31.ffn_norm.weight", - "name": "primals_289", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "norm.weight", - "name": "primals_290", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [], - "module_path": "output.weight", - "name": "primals_291", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "param", - "placement": "S(0)S(0)", - "shape": [ - 128256, - 4096 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [], - "module_path": "freqs_cis", - "name": "primals_292", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "buffer", - "placement": "RR", - "shape": [ - 8192, - 64 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "int64", - "inputs": [], - "name": "primals_293", - "op": "placeholder", - "phase": "forward", - "placeholder_kind": "input", - "placement": "S(0)R", - "shape": [ - 8, - 8192 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - 
"inputs": [], - "name": "tangents_1", - "op": "placeholder", - "phase": "backward", - "placeholder_kind": "tangent", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 128256 - ], - "transition_cost": 0.0 - }, - { - "compute_cost": 76.40578345195063, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(1)S(1)", - "name": "primals_1", - "src_placement": "S(1)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].tok_embeddings", - "name": "dtype_cast", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(1)S(1)", - "shape": [ - 128256, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "int64", - "inputs": [ - { - "comm_cost": 21.38246153846154, - "dst_placement": "RR", - "name": "primals_293", - "src_placement": "S(0)R", - "transition_cost": 1 - } - ], - "module_path": "L['self'].tok_embeddings", - "name": "alias_default_1", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 8, - 8192 - ], - "transition_cost": 1.0 - }, - { - "compute_cost": 38.685829146330285, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(1)S(1)", - "name": "dtype_cast", - "src_placement": "S(1)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_1", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].tok_embeddings", - "name": "embedding", - "op": "aten.embedding.default", - "phase": "forward", - "placement": "S(2)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 539 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 0, - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_9", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "dtype_cast_1", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 706.2108351658422, - "dst_placement": "S(0)S(1)", - "name": "embedding", - "src_placement": "S(2)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].tok_embeddings", - "name": "alias_default_3", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 539 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 1, - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "convert_element_type", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "alias_default_5", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "pow_1", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "mean", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "add", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "rsqrt", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "alias_default_6", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 8, - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "mul", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_1", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "alias_default_4", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_4", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "mul_1", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": 
[ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "convert_element_type_1", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_2", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wq", - "name": "dtype_cast_2", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_2", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention.wq", - "name": "permute", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ 
- 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_1", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "alias_default_7", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wq", - "name": "alias_default_8", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_7", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_8", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wq", - "name": 
"einsum_default", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_3", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wk", - "name": "dtype_cast_3", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_3", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention.wk", - "name": "permute_1", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wk", - "name": "alias_default_9", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_7", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_9", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention.wk", - "name": "einsum_default_1", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_4", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wv", - "name": "dtype_cast_4", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_4", - "src_placement": 
"S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention.wv", - "name": "permute_2", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_2", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wv", - "name": "alias_default_10", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_7", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_10", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wv", - "name": "einsum_default_2", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_6", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_7", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_8", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "view_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "convert_element_type_8", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_9", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_as_complex", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 31, - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "convert_element_type_9", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_10", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_as_complex_1", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "primals_292", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "name": "alias_default", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 8192, - 64 - ], - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_11", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_11", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "alias_default_11", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_11", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "mul_2", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_as_real", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_12", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "compute_cost": 
14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_11", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "mul_3", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_3", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_as_real_1", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_13", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "convert_element_type_10", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "convert_element_type_11", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "unsqueeze", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], 
- "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "expand", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "clone", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_14", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, 
n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "unsqueeze_1", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "expand_1", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "clone_1", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, 
head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_15", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "permute_3", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_14", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "permute_4", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, 
seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "permute_5", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "alias_default_12", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_4", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "alias_default_13", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 
2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "alias_default_14", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_13", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.sdpa", - "name": "_scaled_dot_product_flash_attention", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.sdpa", - "name": "getitem", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.sdpa", - "name": "getitem_1", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.sdpa", - "name": "getitem_6", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.sdpa", - "name": "getitem_7", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.sdpa", - "name": "alias_default_15", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "permute_6", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - 
"dst_placement": "S(0)S(1)", - "name": "permute_6", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_16", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_5", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wo", - "name": "dtype_cast_5", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_5", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention.wo", - "name": "permute_7", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_16", - "src_placement": 
"S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "alias_default_16", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_7", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wo", - "name": "alias_default_17", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_17", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wo", - "name": "einsum_default_3", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)S(1)", - "name": "alias_default_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0", - "name": "add_1", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_10", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "dtype_cast_6", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0", - "name": "alias_default_18", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "convert_element_type_14", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "alias_default_20", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "pow_2", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - 
"compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_2", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "mean_1", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "add_2", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_2", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "rsqrt_1", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "rsqrt_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "alias_default_21", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "mul_4", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_6", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "alias_default_19", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "compute_cost": 
52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_4", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_19", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "mul_5", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "convert_element_type_15", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_6", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w1", - "name": "dtype_cast_7", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_7", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w1", - "name": "permute_8", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_15", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "alias_default_22", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_8", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w1", - "name": "alias_default_23", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_22", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_23", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w1", - "name": "einsum_default_4", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w1", - "name": "alias_default_24", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "convert_element_type_18", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "alias_default_25", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "neg", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "exp", - 
"op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "add_3", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_3", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "div", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "convert_element_type_19", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_8", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w3", - "name": "dtype_cast_8", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_8", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w3", - "name": "permute_9", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_9", - "src_placement": 
"RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w3", - "name": "alias_default_27", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_22", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_27", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w3", - "name": "einsum_default_5", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "alias_default_26", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w3", - "name": "alias_default_28", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "mul_6", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": "dtype_cast_9", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_9", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": "permute_10", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "alias_default_29", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_10", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": "alias_default_30", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_30", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": "einsum_default_6", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_6", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0", - "name": "add_4", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_18", - "src_placement": 
"S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "dtype_cast_10", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_4", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0", - "name": "alias_default_31", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "convert_element_type_24", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "convert_element_type_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "alias_default_33", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_33", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "pow_3", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "mean_2", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - 
"inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_2", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "add_5", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "rsqrt_2", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_2", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "alias_default_34", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - 
"inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_33", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_34", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "mul_7", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_10", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "alias_default_32", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_32", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "mul_8", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": 
{ - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_8", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "convert_element_type_25", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_11", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wq", - "name": "dtype_cast_11", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_11", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention.wq", - "name": "permute_11", - "op": 
"aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_25", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "alias_default_35", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_11", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wq", - "name": "alias_default_36", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_35", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - 
{ - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_36", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wq", - "name": "einsum_default_7", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_12", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wk", - "name": "dtype_cast_12", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_12", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention.wk", - "name": "permute_12", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - 
"cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_12", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wk", - "name": "alias_default_37", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_35", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_37", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention.wk", - "name": "einsum_default_8", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_13", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wv", - "name": "dtype_cast_13", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_13", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention.wv", - "name": "permute_13", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_13", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wv", - "name": "alias_default_38", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_35", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_38", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.1.attention.wv", - "name": "einsum_default_9", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_31", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_32", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_9", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_33", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "convert_element_type_32", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_32", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_34", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, 
- "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_as_complex_2", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_32", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "convert_element_type_33", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_33", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_35", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = 
torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_as_complex_3", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_36", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_36", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "alias_default_39", - "op": "aten.alias.default", - 
"phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_39", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "mul_9", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_as_real_2", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_37", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_3", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_39", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "mul_10", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_as_real_3", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_3", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_38", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "convert_element_type_34", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_38", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "convert_element_type_35", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "unsqueeze_2", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "expand_2", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 
- } - ], - "module_path": "L['self'].layers.1.attention", - "name": "clone_2", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_39", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_33", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "unsqueeze_3", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_3", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "expand_3", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_3", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "clone_3", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_3", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_40", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "permute_14", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_39", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "permute_15", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_40", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "permute_16", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - 
}, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "alias_default_40", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "alias_default_41", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "alias_default_42", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_40", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_41", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_42", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_1", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.sdpa", - "name": "getitem_9", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 
0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.sdpa", - "name": "getitem_10", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_1", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.sdpa", - "name": "getitem_15", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_1", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.sdpa", - "name": "getitem_16", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.sdpa", - "name": "alias_default_43", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_43", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "permute_17", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_17", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_41", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - 
"cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_14", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wo", - "name": "dtype_cast_14", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_14", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention.wo", - "name": "permute_18", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_41", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "alias_default_44", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_18", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wo", - "name": "alias_default_45", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_44", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_45", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wo", - "name": "einsum_default_10", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_10", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1", - "name": "add_6", - "op": 
"aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_19", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "dtype_cast_15", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1", - "name": "alias_default_46", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_46", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "convert_element_type_38", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_38", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "alias_default_48", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_48", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "pow_4", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "pow_4", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "mean_3", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "add_7", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "rsqrt_3", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"rsqrt_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "alias_default_49", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_48", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_49", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "mul_11", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_15", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "alias_default_47", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - 
{ - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_11", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_47", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "mul_12", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "convert_element_type_39", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_15", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w1", - "name": "dtype_cast_16", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - 
], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_16", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w1", - "name": "permute_19", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_39", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "alias_default_50", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_19", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w1", - "name": "alias_default_51", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_50", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_51", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w1", - "name": "einsum_default_11", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w1", - "name": "alias_default_52", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "convert_element_type_42", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_42", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "alias_default_53", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "neg_1", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "exp_1", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "add_8", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "div_1", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 
14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "convert_element_type_43", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_17", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w3", - "name": "dtype_cast_17", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_17", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.1.feed_forward.w3", - "name": "permute_20", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_20", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w3", - "name": "alias_default_55", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_50", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_55", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w3", - "name": "einsum_default_12", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": 
"alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "alias_default_54", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w3", - "name": "alias_default_56", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_54", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "mul_13", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - 
"code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": "dtype_cast_18", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_18", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": "permute_21", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "alias_default_57", - "op": "aten.alias.default", - 
"phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_21", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": "alias_default_58", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_58", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": "einsum_default_13", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_46", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_13", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1", - "name": "add_9", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_27", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "dtype_cast_19", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1", - "name": "alias_default_59", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_59", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "convert_element_type_48", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_48", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "alias_default_61", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_61", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "pow_5", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "mean_4", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_4", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "add_10", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_10", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "rsqrt_4", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - 
"shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_4", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "alias_default_62", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_61", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_62", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "mul_14", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_19", - "src_placement": "S(0)S(0)", - 
"transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "alias_default_60", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_60", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "mul_15", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "convert_element_type_49", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_20", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wq", - "name": "dtype_cast_20", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_20", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention.wq", - "name": "permute_22", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_49", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "alias_default_63", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_22", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wq", - "name": "alias_default_64", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_63", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_64", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wq", - "name": "einsum_default_14", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_21", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wk", - "name": "dtype_cast_21", - 
"op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_21", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention.wk", - "name": "permute_23", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_23", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wk", - "name": "alias_default_65", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_63", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - 
"comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_65", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention.wk", - "name": "einsum_default_15", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_22", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wv", - "name": "dtype_cast_22", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_22", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention.wv", - "name": "permute_24", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - 
"cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_24", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wv", - "name": "alias_default_66", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_63", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_66", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wv", - "name": "einsum_default_16", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_14", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_56", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, 
seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_57", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_16", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_58", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "convert_element_type_56", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_59", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_as_complex_4", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_57", 
- "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "convert_element_type_57", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_60", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_60", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_as_complex_5", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_61", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_61", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "alias_default_67", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_67", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "mul_16", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = 
torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_16", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_as_real_4", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_62", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_67", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "mul_17", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_as_real_5", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_63", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_62", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "convert_element_type_58", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_63", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "convert_element_type_59", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "unsqueeze_4", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - 
"func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "expand_4", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "clone_4", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_64", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "unsqueeze_5", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "expand_5", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "clone_5", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": 
".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_65", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "permute_25", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_64", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "permute_26", - "op": "aten.permute.default", - "phase": "forward", - 
"placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_65", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "permute_27", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "alias_default_68", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.2.attention", - "name": "alias_default_69", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "alias_default_70", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_68", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_69", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_70", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_2", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, 
None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_2", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.sdpa", - "name": "getitem_18", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_2", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.sdpa", - "name": "getitem_19", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_2", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.sdpa", - "name": "getitem_24", - "op": "", - 
"phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_2", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.sdpa", - "name": "getitem_25", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.sdpa", - "name": "alias_default_71", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_71", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "permute_28", - "op": 
"aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_28", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_66", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_23", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wo", - "name": "dtype_cast_23", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_23", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.2.attention.wo", - "name": "permute_29", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_66", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "alias_default_72", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_29", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wo", - "name": "alias_default_73", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_72", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - 
"comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_73", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wo", - "name": "einsum_default_17", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_59", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_17", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2", - "name": "add_11", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_28", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "dtype_cast_24", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - 
"func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_11", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2", - "name": "alias_default_74", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_74", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "convert_element_type_62", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_62", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "alias_default_76", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_76", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "pow_6", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "mean_5", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "add_12", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "rsqrt_5", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "alias_default_77", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_76", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_77", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": 
"mul_18", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_24", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "alias_default_75", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_75", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "mul_19", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "convert_element_type_63", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_24", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w1", - "name": "dtype_cast_25", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_25", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w1", - "name": "permute_30", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - 
"cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_63", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "alias_default_78", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_30", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w1", - "name": "alias_default_79", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_78", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_79", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w1", - "name": "einsum_default_18", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - 
"code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w1", - "name": "alias_default_80", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_80", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "convert_element_type_66", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_66", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", 
- "name": "alias_default_81", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_81", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "neg_2", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "exp_2", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_2", - "src_placement": "S(0)S(2)", - "transition_cost": 
0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "add_13", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_81", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "div_2", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "convert_element_type_67", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_26", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w3", - "name": "dtype_cast_26", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_26", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w3", - "name": "permute_31", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_31", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w3", - "name": "alias_default_83", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_78", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_83", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w3", - "name": "einsum_default_19", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_67", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "alias_default_82", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.2.feed_forward.w3", - "name": "alias_default_84", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_82", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_84", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "mul_20", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w2", - "name": "dtype_cast_27", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_27", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w2", - "name": "permute_32", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_20", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "alias_default_85", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_32", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w2", - "name": "alias_default_86", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - 
"func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_85", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_86", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w2", - "name": "einsum_default_20", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_74", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_20", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2", - "name": "add_14", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_36", - "src_placement": "S(0)S(0)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "dtype_cast_28", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2", - "name": "alias_default_87", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_87", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "convert_element_type_72", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_72", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "alias_default_89", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_89", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "pow_7", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "mean_6", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - 
"dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "add_15", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "rsqrt_6", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "alias_default_90", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, 
- "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_89", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_90", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "mul_21", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_28", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "alias_default_88", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_88", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "mul_22", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 
8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_22", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "convert_element_type_73", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_29", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wq", - "name": "dtype_cast_29", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_29", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention.wq", - "name": 
"permute_33", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_73", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "alias_default_91", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_33", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wq", - "name": "alias_default_92", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_91", - "src_placement": "S(0)R", - 
"transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_92", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wq", - "name": "einsum_default_21", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_30", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wk", - "name": "dtype_cast_30", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_30", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention.wk", - "name": "permute_34", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - 
"cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_34", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wk", - "name": "alias_default_93", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_91", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_93", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention.wk", - "name": "einsum_default_22", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_31", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wv", - "name": "dtype_cast_31", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - 
"code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_31", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention.wv", - "name": "permute_35", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_35", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wv", - "name": "alias_default_94", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_91", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_94", - "src_placement": "RS(1)", - "transition_cost": 
0 - } - ], - "module_path": "L['self'].layers.3.attention.wv", - "name": "einsum_default_23", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_21", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_81", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_22", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_82", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"einsum_default_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_83", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_81", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "convert_element_type_80", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_80", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_84", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, 
- { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_84", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_as_complex_6", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_82", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "convert_element_type_81", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_81", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_85", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = 
torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_85", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_as_complex_7", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_86", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_86", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "alias_default_95", - "op": "aten.alias.default", - 
"phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_95", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "mul_23", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_as_real_6", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_87", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_95", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "mul_24", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_as_real_7", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_88", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_87", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "convert_element_type_82", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_88", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "convert_element_type_83", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_83", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "unsqueeze_6", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "expand_6", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 
- } - ], - "module_path": "L['self'].layers.3.attention", - "name": "clone_6", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_89", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_83", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "unsqueeze_7", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_7", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "expand_7", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "clone_7", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_90", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_82", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "permute_36", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_89", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "permute_37", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_90", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "permute_38", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - 
}, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_36", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "alias_default_96", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_37", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "alias_default_97", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_38", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "alias_default_98", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_96", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_97", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_98", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_3", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.sdpa", - "name": "getitem_27", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - 
"transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.sdpa", - "name": "getitem_28", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_3", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.sdpa", - "name": "getitem_33", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_3", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.sdpa", - "name": "getitem_34", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.sdpa", - "name": "alias_default_99", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_99", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "permute_39", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_39", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_91", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - 
"cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_32", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wo", - "name": "dtype_cast_32", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_32", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention.wo", - "name": "permute_40", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_91", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "alias_default_100", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 
315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_40", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wo", - "name": "alias_default_101", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_100", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_101", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wo", - "name": "einsum_default_24", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_87", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3", - "name": "add_16", - 
"op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_37", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "dtype_cast_33", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3", - "name": "alias_default_102", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_102", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "convert_element_type_86", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_86", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "alias_default_104", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_104", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "pow_8", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "pow_8", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "mean_7", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "add_17", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_17", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "rsqrt_7", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"rsqrt_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "alias_default_105", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_104", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_105", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "mul_25", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_33", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "alias_default_103", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - 
}, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_103", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "mul_26", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "convert_element_type_87", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_33", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w1", - "name": "dtype_cast_34", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 
4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_34", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w1", - "name": "permute_41", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_87", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "alias_default_106", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_41", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w1", - "name": "alias_default_107", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_106", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_107", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w1", - "name": "einsum_default_25", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w1", - "name": "alias_default_108", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_108", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "convert_element_type_90", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_90", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "alias_default_109", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_109", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "neg_3", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - 
"func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_3", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "exp_3", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_3", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "add_18", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_109", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "div_3", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, 
- 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_3", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "convert_element_type_91", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_35", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w3", - "name": "dtype_cast_35", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_35", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.3.feed_forward.w3", - "name": "permute_42", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_42", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w3", - "name": "alias_default_111", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_106", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_111", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w3", - "name": "einsum_default_26", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": 
"alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_91", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "alias_default_110", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w3", - "name": "alias_default_112", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_110", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_112", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "mul_27", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { 
- "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_34", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w2", - "name": "dtype_cast_36", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_36", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w2", - "name": "permute_43", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_27", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "alias_default_113", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_43", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w2", - "name": "alias_default_114", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_113", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_114", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w2", - "name": "einsum_default_27", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_102", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_27", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3", - "name": "add_19", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_45", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "dtype_cast_37", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3", - "name": "alias_default_115", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_115", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "convert_element_type_96", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_96", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "alias_default_117", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_117", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "pow_9", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "mean_8", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_8", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "add_20", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "rsqrt_8", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - 
"shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_8", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "alias_default_118", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_117", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_118", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "mul_28", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_37", - "src_placement": 
"S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "alias_default_116", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_28", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_116", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "mul_29", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_29", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "convert_element_type_97", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - 
{ - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_38", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wq", - "name": "dtype_cast_38", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_38", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention.wq", - "name": "permute_44", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_97", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "alias_default_119", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_44", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wq", - "name": "alias_default_120", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_119", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_120", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wq", - "name": "einsum_default_28", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_39", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wk", - "name": "dtype_cast_39", - 
"op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_39", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention.wk", - "name": "permute_45", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_45", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wk", - "name": "alias_default_121", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_119", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - 
"comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_121", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention.wk", - "name": "einsum_default_29", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_40", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wv", - "name": "dtype_cast_40", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_40", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention.wv", - "name": "permute_46", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - 
"cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_46", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wv", - "name": "alias_default_122", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_119", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_122", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wv", - "name": "einsum_default_30", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_106", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = 
xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_107", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_30", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_108", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_106", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "convert_element_type_104", - "op": "prims.convert_element_type.default", - "phase": "forward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_104", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_109", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_109", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_as_complex_8", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - 
"name": "view_107", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "convert_element_type_105", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_105", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_110", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_110", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_as_complex_9", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 
208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_111", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_111", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "alias_default_123", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_123", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "mul_30", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out 
= torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_30", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_as_real_8", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_112", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_123", - 
"src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "mul_31", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_as_real_9", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_113", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 
39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_112", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "convert_element_type_106", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_113", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "convert_element_type_107", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_107", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "unsqueeze_8", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "expand_8", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "clone_8", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_114", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - 
"code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_108", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "unsqueeze_9", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "expand_9", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "clone_9", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", 
- "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_115", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_106", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "permute_47", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_114", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": 
"permute_48", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_115", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "permute_49", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_47", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "alias_default_124", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"permute_48", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "alias_default_125", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_49", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "alias_default_126", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_124", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_125", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_126", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_4", - "op": 
"aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_4", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.sdpa", - "name": "getitem_36", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_4", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.sdpa", - "name": "getitem_37", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_4", - "src_placement": "RR", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.sdpa", - "name": "getitem_42", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_4", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.sdpa", - "name": "getitem_43", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_36", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.sdpa", - "name": "alias_default_127", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_127", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "permute_50", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_50", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_116", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_41", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wo", - "name": "dtype_cast_41", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", 
- "name": "dtype_cast_41", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention.wo", - "name": "permute_51", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_116", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "alias_default_128", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_51", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wo", - "name": "alias_default_129", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)S(1)", - "name": "alias_default_128", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_129", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wo", - "name": "einsum_default_31", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_115", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4", - "name": "add_21", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_46", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "dtype_cast_42", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4", - "name": "alias_default_130", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_130", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "convert_element_type_110", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_110", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "alias_default_132", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_132", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "pow_10", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_10", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "mean_9", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "add_22", - 
"op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_22", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "rsqrt_9", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "alias_default_133", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_132", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"alias_default_133", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "mul_32", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_42", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "alias_default_131", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_32", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_131", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "mul_33", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 
81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_33", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "convert_element_type_111", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_42", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w1", - "name": "dtype_cast_43", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_43", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w1", - "name": "permute_52", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_111", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "alias_default_134", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_52", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w1", - "name": "alias_default_135", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_134", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_135", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.4.feed_forward.w1", - "name": "einsum_default_32", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_32", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w1", - "name": "alias_default_136", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_136", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "convert_element_type_114", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_114", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "alias_default_137", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_137", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "neg_4", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "exp_4", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - 
"cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "add_23", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_137", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "div_4", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "convert_element_type_115", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { 
- "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_44", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w3", - "name": "dtype_cast_44", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_44", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w3", - "name": "permute_53", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_53", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w3", - "name": "alias_default_139", - "op": "aten.alias.default", - 
"phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_134", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_139", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w3", - "name": "einsum_default_33", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_115", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "alias_default_138", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_33", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w3", - "name": "alias_default_140", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_138", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_140", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "mul_34", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_43", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w2", - "name": "dtype_cast_45", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_45", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w2", - "name": "permute_54", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "alias_default_141", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_54", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w2", - "name": "alias_default_142", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_141", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_142", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w2", - "name": "einsum_default_34", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_130", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_34", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4", - "name": "add_24", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - 
"compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_54", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "dtype_cast_46", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4", - "name": "alias_default_143", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_143", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "convert_element_type_120", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", 
- "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_120", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "alias_default_145", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_145", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "pow_11", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_11", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "mean_10", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_10", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "add_25", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "rsqrt_10", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_10", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "alias_default_146", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_145", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_146", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "mul_35", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_46", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "alias_default_144", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_35", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_144", - 
"src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "mul_36", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_36", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "convert_element_type_121", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_47", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wq", - "name": "dtype_cast_47", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - 
"inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_47", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention.wq", - "name": "permute_55", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_121", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "alias_default_147", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_55", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wq", - "name": "alias_default_148", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 
16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_147", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_148", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wq", - "name": "einsum_default_35", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_48", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wk", - "name": "dtype_cast_48", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_48", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention.wk", - "name": "permute_56", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { 
- "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_56", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wk", - "name": "alias_default_149", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_147", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_149", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention.wk", - "name": "einsum_default_36", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_49", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.5.attention.wv", - "name": "dtype_cast_49", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_49", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention.wv", - "name": "permute_57", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_57", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wv", - "name": "alias_default_150", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)R", - "name": "alias_default_147", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_150", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wv", - "name": "einsum_default_37", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_131", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_36", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_132", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 
296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_133", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_131", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "convert_element_type_128", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_128", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_134", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = 
torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_134", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_as_complex_10", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_132", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "convert_element_type_129", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_129", - "src_placement": "S(0)S(2)", - "transition_cost": 
0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_135", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_135", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_as_complex_11", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_136", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_136", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "alias_default_151", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_151", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "mul_37", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_as_real_10", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_137", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_151", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "mul_38", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_38", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.5.attention", - "name": "view_as_real_11", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_138", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_137", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "convert_element_type_130", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_138", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "convert_element_type_131", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_131", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "unsqueeze_10", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "expand_10", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "clone_10", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_139", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_133", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "unsqueeze_11", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "expand_11", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "clone_11", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_140", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": 
{ - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_130", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "permute_58", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_139", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "permute_59", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_140", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "permute_60", - "op": 
"aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_58", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "alias_default_152", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_59", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "alias_default_153", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"permute_60", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "alias_default_154", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_152", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_153", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_154", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_5", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.5.attention.sdpa", - "name": "getitem_45", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.sdpa", - "name": "getitem_46", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_5", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.sdpa", - "name": "getitem_51", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_5", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.sdpa", - "name": "getitem_52", - "op": "", - 
"phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_45", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.sdpa", - "name": "alias_default_155", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_155", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "permute_61", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_61", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.5.attention", - "name": "view_141", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_50", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wo", - "name": "dtype_cast_50", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_50", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention.wo", - "name": "permute_62", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_141", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "alias_default_156", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_62", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wo", - "name": "alias_default_157", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_156", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_157", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wo", - "name": "einsum_default_38", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 
39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_143", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_38", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5", - "name": "add_26", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_55", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "dtype_cast_51", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5", - "name": "alias_default_158", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_158", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "convert_element_type_134", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_134", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "alias_default_160", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_160", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "pow_12", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", 
- "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "mean_11", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_11", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "add_27", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "rsqrt_11", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, 
- 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_11", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "alias_default_161", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_160", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_161", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "mul_39", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_51", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } 
- ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "alias_default_159", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_39", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_159", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "mul_40", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_40", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "convert_element_type_135", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": 
"dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_51", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w1", - "name": "dtype_cast_52", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_52", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w1", - "name": "permute_63", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_135", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "alias_default_162", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - 
"func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_63", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w1", - "name": "alias_default_163", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_162", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_163", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w1", - "name": "einsum_default_39", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_39", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w1", - "name": "alias_default_164", - "op": "aten.alias.default", 
- "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_164", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "convert_element_type_138", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_138", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "alias_default_165", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_165", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "neg_5", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "exp_5", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "add_28", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "alias_default_165", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "div_5", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "convert_element_type_139", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_53", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w3", - "name": "dtype_cast_53", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_53", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w3", - "name": "permute_64", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_64", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w3", - "name": "alias_default_167", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_162", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_167", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w3", - "name": 
"einsum_default_40", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_139", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "alias_default_166", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_40", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w3", - "name": "alias_default_168", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"alias_default_166", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_168", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "mul_41", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_52", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w2", - "name": "dtype_cast_54", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_54", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w2", - "name": "permute_65", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_41", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "alias_default_169", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_65", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w2", - "name": "alias_default_170", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_169", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_170", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w2", - "name": "einsum_default_41", - "op": "aten.einsum.default", - "phase": 
"forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_158", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_41", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5", - "name": "add_29", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_63", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "dtype_cast_55", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "add_29", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5", - "name": "alias_default_171", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_171", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "convert_element_type_144", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_144", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "alias_default_173", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - 
"cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_173", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "pow_13", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_13", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "mean_12", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "add_30", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_30", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "rsqrt_12", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "alias_default_174", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_173", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_174", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "mul_42", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_55", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "alias_default_172", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_42", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_172", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "mul_43", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_43", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.6.attention_norm", - "name": "convert_element_type_145", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_56", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wq", - "name": "dtype_cast_56", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_56", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention.wq", - "name": "permute_66", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": 
"S(0)R", - "name": "convert_element_type_145", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "alias_default_175", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_66", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wq", - "name": "alias_default_176", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_175", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_176", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wq", - "name": "einsum_default_42", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - 
"func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_57", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wk", - "name": "dtype_cast_57", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_57", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention.wk", - "name": "permute_67", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_67", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wk", - "name": "alias_default_177", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_175", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_177", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention.wk", - "name": "einsum_default_43", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_58", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wv", - "name": "dtype_cast_58", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_58", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.6.attention.wv", - "name": "permute_68", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_68", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wv", - "name": "alias_default_178", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_175", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_178", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wv", - "name": "einsum_default_44", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_42", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_156", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_157", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_44", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_158", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_156", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "convert_element_type_152", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_152", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_159", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_159", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_as_complex_12", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = 
torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_157", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "convert_element_type_153", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_153", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_160", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_160", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.6.attention", - "name": "view_as_complex_13", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_161", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_161", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "alias_default_179", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"view_as_complex_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_179", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "mul_44", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_44", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_as_real_12", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_162", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - 
"func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_179", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "mul_45", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_45", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_as_real_13", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_163", - "op": "aten.view.default", - 
"phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_162", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "convert_element_type_154", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_163", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "convert_element_type_155", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)S(2)", - "name": "convert_element_type_155", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "unsqueeze_12", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "expand_12", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "clone_12", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_164", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_158", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "unsqueeze_13", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "expand_13", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - 
"cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "clone_13", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_165", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_154", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "permute_69", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_164", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "permute_70", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_165", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "permute_71", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_69", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "alias_default_180", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 
32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_70", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "alias_default_181", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_71", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "alias_default_182", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_180", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_181", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_182", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_6", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.sdpa", - "name": "getitem_54", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.sdpa", - "name": "getitem_55", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - 
"source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_6", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.sdpa", - "name": "getitem_60", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_6", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.sdpa", - "name": "getitem_61", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_54", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.sdpa", - "name": "alias_default_183", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return 
F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_183", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "permute_72", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_72", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_166", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_59", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wo", - "name": "dtype_cast_59", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - 
"shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_59", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention.wo", - "name": "permute_73", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_166", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "alias_default_184", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_73", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wo", - "name": "alias_default_185", - "op": "aten.alias.default", - "phase": "forward", - 
"placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_184", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_185", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wo", - "name": "einsum_default_45", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_171", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_45", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6", - "name": "add_31", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 
7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_64", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "dtype_cast_60", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6", - "name": "alias_default_186", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_186", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "convert_element_type_158", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 
2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_158", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "alias_default_188", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_188", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "pow_14", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "mean_13", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_13", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "add_32", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_32", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "rsqrt_13", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_13", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "alias_default_189", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_188", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_189", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "mul_46", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_60", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "alias_default_187", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_46", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_187", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "mul_47", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_47", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "convert_element_type_159", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_60", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w1", - "name": "dtype_cast_61", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ 
- { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_61", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w1", - "name": "permute_74", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_159", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "alias_default_190", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_74", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w1", - "name": "alias_default_191", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - 
"cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_190", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_191", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w1", - "name": "einsum_default_46", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_46", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w1", - "name": "alias_default_192", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_192", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "convert_element_type_162", - "op": "prims.convert_element_type.default", - "phase": "forward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_162", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "alias_default_193", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_193", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "neg_6", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.6.feed_forward", - "name": "exp_6", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "add_33", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_193", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_33", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "div_6", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 
136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "convert_element_type_163", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_62", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w3", - "name": "dtype_cast_62", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_62", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w3", - "name": "permute_75", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_75", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w3", - "name": "alias_default_195", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_190", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_195", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w3", - "name": "einsum_default_47", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_163", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "alias_default_194", - "op": "aten.alias.default", - 
"phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_47", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w3", - "name": "alias_default_196", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_194", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_196", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "mul_48", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_61", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w2", - "name": "dtype_cast_63", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_63", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w2", - "name": "permute_76", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_48", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "alias_default_197", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - 
"cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_76", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w2", - "name": "alias_default_198", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_197", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_198", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w2", - "name": "einsum_default_48", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_186", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_48", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.6", - "name": "add_34", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_72", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "dtype_cast_64", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_34", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6", - "name": "alias_default_199", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_199", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "convert_element_type_168", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_168", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "alias_default_201", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_201", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "pow_15", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": 
"float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "mean_14", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "add_35", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_35", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "rsqrt_14", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "alias_default_202", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_201", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_202", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "mul_49", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_64", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "alias_default_200", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_49", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_200", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "mul_50", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_50", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "convert_element_type_169", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_65", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wq", - "name": 
"dtype_cast_65", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_65", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention.wq", - "name": "permute_77", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_169", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "alias_default_203", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_77", - "src_placement": 
"RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wq", - "name": "alias_default_204", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_203", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_204", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wq", - "name": "einsum_default_49", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_66", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wk", - "name": "dtype_cast_66", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_66", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention.wk", - "name": "permute_78", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_78", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wk", - "name": "alias_default_205", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_203", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_205", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention.wk", - "name": "einsum_default_50", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - 
], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_67", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wv", - "name": "dtype_cast_67", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_67", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention.wv", - "name": "permute_79", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_79", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wv", - "name": "alias_default_206", - "op": "aten.alias.default", 
- "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_203", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_206", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wv", - "name": "einsum_default_51", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_49", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_181", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "einsum_default_50", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_182", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_51", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_183", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_181", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "convert_element_type_176", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { 
- "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_176", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_184", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_184", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_as_complex_14", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_182", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "convert_element_type_177", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = 
torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_177", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_185", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_185", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_as_complex_15", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - 
"name": "view_186", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_186", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "alias_default_207", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_14", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_207", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "mul_51", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_51", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_as_real_14", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_14", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_187", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_207", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "mul_52", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_as_real_15", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_188", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_187", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "convert_element_type_178", - "op": "prims.convert_element_type.default", - 
"phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_188", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "convert_element_type_179", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_179", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "unsqueeze_14", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_14", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "expand_14", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_14", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "clone_14", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_14", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_189", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "view_183", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "unsqueeze_15", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "expand_15", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "clone_15", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_190", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_178", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "permute_80", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_189", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "permute_81", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, 
- "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_190", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "permute_82", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_80", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "alias_default_208", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_81", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "alias_default_209", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, 
head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_82", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "alias_default_210", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_208", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_209", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_210", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_7", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 
- }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.sdpa", - "name": "getitem_63", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.sdpa", - "name": "getitem_64", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_7", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.sdpa", - "name": "getitem_69", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - 
"transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_7", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.sdpa", - "name": "getitem_70", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_63", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.sdpa", - "name": "alias_default_211", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_211", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "permute_83", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - 
}, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_83", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_191", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_68", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wo", - "name": "dtype_cast_68", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_68", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention.wo", - "name": "permute_84", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_191", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "alias_default_212", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_84", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wo", - "name": "alias_default_213", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_212", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_213", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wo", - "name": "einsum_default_52", - "op": 
"aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_199", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_52", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7", - "name": "add_36", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_73", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "dtype_cast_69", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "add_36", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7", - "name": "alias_default_214", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_214", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "convert_element_type_182", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_182", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "alias_default_216", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - 
"cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_216", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "pow_16", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "mean_15", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "add_37", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - 
"cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_37", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "rsqrt_15", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "alias_default_217", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_216", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_217", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "mul_53", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - 
"file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_69", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "alias_default_215", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_53", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_215", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "mul_54", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_54", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "convert_element_type_183", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_69", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w1", - "name": "dtype_cast_70", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_70", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w1", - "name": "permute_85", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_183", - "src_placement": 
"S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "alias_default_218", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_85", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w1", - "name": "alias_default_219", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_218", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_219", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w1", - "name": "einsum_default_53", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - 
}, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w1", - "name": "alias_default_220", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_220", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "convert_element_type_186", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_186", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "alias_default_221", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * 
self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_221", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "neg_7", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "exp_7", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "add_38", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_221", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_38", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "div_7", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "convert_element_type_187", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": 
"primals_71", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w3", - "name": "dtype_cast_71", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_71", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w3", - "name": "permute_86", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_86", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w3", - "name": "alias_default_223", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_218", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_223", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w3", - "name": "einsum_default_54", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_187", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "alias_default_222", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_54", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w3", - "name": "alias_default_224", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_222", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_224", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "mul_55", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_70", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w2", - "name": "dtype_cast_72", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_72", - "src_placement": "S(0)S(1)", - 
"transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w2", - "name": "permute_87", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_55", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "alias_default_225", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_87", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w2", - "name": "alias_default_226", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "alias_default_225", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_226", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w2", - "name": "einsum_default_55", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_214", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_55", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7", - "name": "add_39", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_81", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "dtype_cast_73", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], 
- "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_39", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7", - "name": "alias_default_227", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_227", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "convert_element_type_192", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_192", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": 
"alias_default_229", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_229", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "pow_17", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_17", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "mean_16", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.8.attention_norm", - "name": "add_40", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_40", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "rsqrt_16", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "alias_default_230", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_229", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_230", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "mul_56", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_73", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "alias_default_228", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_56", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_228", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "mul_57", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": 
"rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_57", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "convert_element_type_193", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_74", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wq", - "name": "dtype_cast_74", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_74", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention.wq", - "name": "permute_88", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_193", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "alias_default_231", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_88", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wq", - "name": "alias_default_232", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_231", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_232", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.8.attention.wq", - "name": "einsum_default_56", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_75", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wk", - "name": "dtype_cast_75", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_75", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention.wk", - "name": "permute_89", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": 
"permute_89", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wk", - "name": "alias_default_233", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_231", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_233", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention.wk", - "name": "einsum_default_57", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_76", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wv", - "name": "dtype_cast_76", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - 
"line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_76", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention.wv", - "name": "permute_90", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_90", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wv", - "name": "alias_default_234", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_231", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_234", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wv", - "name": "einsum_default_58", - "op": "aten.einsum.default", - "phase": "forward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_206", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_207", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_208", - "op": 
"aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_206", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "convert_element_type_200", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_200", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_209", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "view_209", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_as_complex_16", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_207", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "convert_element_type_201", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_201", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_210", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_210", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_as_complex_17", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_211", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_211", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "alias_default_235", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - 
], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_16", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_235", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "mul_58", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_as_real_16", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_16", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_212", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_235", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "mul_59", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_as_real_17", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_213", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_212", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "convert_element_type_202", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_213", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "convert_element_type_203", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 
8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_203", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "unsqueeze_16", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_16", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "expand_16", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_16", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "clone_16", - "op": 
"aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_16", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_214", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_208", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "unsqueeze_17", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.8.attention", - "name": "expand_17", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "clone_17", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_215", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"convert_element_type_202", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "permute_91", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_214", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "permute_92", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_215", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "permute_93", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": 
"alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_91", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "alias_default_236", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_92", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "alias_default_237", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_93", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "alias_default_238", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_236", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_237", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_238", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_8", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_8", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.sdpa", - "name": "getitem_72", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 
- }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_8", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.sdpa", - "name": "getitem_73", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_8", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.sdpa", - "name": "getitem_78", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_8", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.sdpa", - "name": "getitem_79", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_72", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.sdpa", - "name": "alias_default_239", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_239", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "permute_94", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_94", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_216", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - 
"cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_77", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wo", - "name": "dtype_cast_77", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_77", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention.wo", - "name": "permute_95", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_216", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "alias_default_240", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_95", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wo", - "name": "alias_default_241", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_240", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_241", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wo", - "name": "einsum_default_59", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_227", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_59", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8", - "name": "add_41", - "op": 
"aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_82", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "dtype_cast_78", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_41", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8", - "name": "alias_default_242", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_242", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "convert_element_type_206", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_206", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "alias_default_244", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_244", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "pow_18", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, 
- "dst_placement": "S(0)S(1)", - "name": "pow_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "mean_17", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_17", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "add_42", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_42", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "rsqrt_17", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "rsqrt_17", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "alias_default_245", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_244", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_245", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "mul_60", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_78", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "alias_default_243", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - 
"transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_60", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_243", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "mul_61", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_61", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "convert_element_type_207", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_78", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w1", - "name": "dtype_cast_79", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", 
- "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_79", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w1", - "name": "permute_96", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_207", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "alias_default_246", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_96", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w1", - "name": 
"alias_default_247", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_246", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_247", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w1", - "name": "einsum_default_60", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_60", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w1", - "name": "alias_default_248", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - 
"compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_248", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "convert_element_type_210", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_210", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "alias_default_249", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_249", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "neg_8", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "exp_8", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "add_43", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_249", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "div_8", - "op": 
"aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "convert_element_type_211", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_80", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w3", - "name": "dtype_cast_80", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_80", - 
"src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w3", - "name": "permute_97", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_97", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w3", - "name": "alias_default_251", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_246", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_251", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w3", - "name": "einsum_default_61", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_211", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "alias_default_250", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_61", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w3", - "name": "alias_default_252", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_250", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_252", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "mul_62", - "op": "aten.mul.Tensor", - "phase": "forward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_79", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w2", - "name": "dtype_cast_81", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_81", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w2", - "name": "permute_98", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_62", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.8.feed_forward", - "name": "alias_default_253", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_98", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w2", - "name": "alias_default_254", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_253", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_254", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w2", - "name": "einsum_default_62", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - 
"cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_242", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_62", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8", - "name": "add_44", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_90", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "dtype_cast_82", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_44", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8", - "name": "alias_default_255", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + 
self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_255", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "convert_element_type_216", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_216", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "alias_default_257", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_257", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "pow_19", - "op": 
"aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "mean_18", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "add_45", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_45", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "rsqrt_18", - "op": 
"aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "alias_default_258", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_257", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_258", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "mul_63", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - 
"dst_placement": "RR", - "name": "dtype_cast_82", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "alias_default_256", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_63", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_256", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "mul_64", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_64", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "convert_element_type_217", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - 
"func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_83", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wq", - "name": "dtype_cast_83", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_83", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention.wq", - "name": "permute_99", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_217", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "alias_default_259", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_99", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wq", - "name": "alias_default_260", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_259", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_260", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wq", - "name": "einsum_default_63", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_84", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.9.attention.wk", - "name": "dtype_cast_84", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_84", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention.wk", - "name": "permute_100", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_100", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wk", - "name": "alias_default_261", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": 
"alias_default_259", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_261", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention.wk", - "name": "einsum_default_64", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_85", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wv", - "name": "dtype_cast_85", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_85", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention.wv", - "name": "permute_101", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - 
"line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_101", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wv", - "name": "alias_default_262", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_259", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_262", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wv", - "name": "einsum_default_65", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_63", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_231", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - 
"shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_64", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_232", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_65", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_233", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_231", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": 
"convert_element_type_224", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_224", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_234", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_234", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_as_complex_18", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 
9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_232", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "convert_element_type_225", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_225", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_235", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_235", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_as_complex_19", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_236", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_236", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "alias_default_263", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_263", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "mul_65", - "op": "aten.mul.Tensor", - 
"phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_65", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_as_real_18", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_237", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_19", - "src_placement": "S(0)S(2)", 
- "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_263", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "mul_66", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_66", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_as_real_19", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_238", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_237", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "convert_element_type_226", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_238", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "convert_element_type_227", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_227", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "unsqueeze_18", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], 
- "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "expand_18", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "clone_18", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_239", - "op": "aten.view.default", - "phase": "forward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_233", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "unsqueeze_19", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "expand_19", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "clone_19", - 
"op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_240", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_226", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "permute_102", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_239", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "permute_103", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_240", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "permute_104", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_102", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "alias_default_264", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_103", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "alias_default_265", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_104", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "alias_default_266", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_264", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_265", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_266", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.sdpa", - "name": 
"_scaled_dot_product_flash_attention_9", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.sdpa", - "name": "getitem_81", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.sdpa", - "name": "getitem_82", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": 
"_scaled_dot_product_flash_attention_9", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.sdpa", - "name": "getitem_87", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_9", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.sdpa", - "name": "getitem_88", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_81", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.sdpa", - "name": "alias_default_267", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)S(1)", - "name": "alias_default_267", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "permute_105", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_105", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_241", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_86", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wo", - "name": "dtype_cast_86", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", 
- "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_86", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention.wo", - "name": "permute_106", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_241", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "alias_default_268", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_106", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wo", - "name": "alias_default_269", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_268", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_269", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wo", - "name": "einsum_default_66", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_255", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_66", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9", - "name": "add_46", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_91", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "dtype_cast_87", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": 
[ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_46", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9", - "name": "alias_default_270", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_270", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "convert_element_type_230", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_230", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", 
- "name": "alias_default_272", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_272", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "pow_20", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "mean_19", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.9.ffn_norm", - "name": "add_47", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_47", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "rsqrt_19", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "alias_default_273", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_272", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_273", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "mul_67", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_87", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "alias_default_271", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_67", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_271", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "mul_68", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 
2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_68", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "convert_element_type_231", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_87", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w1", - "name": "dtype_cast_88", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_88", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w1", - "name": "permute_107", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * 
self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_231", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "alias_default_274", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_107", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w1", - "name": "alias_default_275", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_274", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_275", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.9.feed_forward.w1", - "name": "einsum_default_67", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_67", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w1", - "name": "alias_default_276", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_276", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "convert_element_type_234", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_234", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "alias_default_277", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_277", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "neg_9", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "exp_9", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - 
"cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "add_48", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_277", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_48", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "div_9", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "convert_element_type_235", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { 
- "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_89", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w3", - "name": "dtype_cast_89", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_89", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w3", - "name": "permute_108", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_108", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w3", - "name": "alias_default_279", - "op": "aten.alias.default", 
- "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_274", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_279", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w3", - "name": "einsum_default_68", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_235", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "alias_default_278", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - 
"inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_68", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w3", - "name": "alias_default_280", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_278", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_280", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "mul_69", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_88", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w2", - "name": "dtype_cast_90", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, 
_dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_90", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w2", - "name": "permute_109", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_69", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "alias_default_281", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_109", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w2", - "name": "alias_default_282", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - 
], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_281", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_282", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w2", - "name": "einsum_default_69", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_270", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_69", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9", - "name": "add_49", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - 
"compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_99", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "dtype_cast_91", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_49", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9", - "name": "alias_default_283", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_283", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "convert_element_type_240", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": 
"rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_240", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "alias_default_285", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_285", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "pow_21", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "mean_20", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "add_50", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_50", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "rsqrt_20", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "alias_default_286", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_285", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_286", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "mul_70", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_91", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "alias_default_284", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_70", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_284", - 
"src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "mul_71", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_71", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "convert_element_type_241", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_92", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wq", - "name": "dtype_cast_92", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - 
"inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_92", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention.wq", - "name": "permute_110", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_241", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "alias_default_287", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_110", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wq", - "name": "alias_default_288", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_287", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_288", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wq", - "name": "einsum_default_70", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_93", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wk", - "name": "dtype_cast_93", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_93", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention.wk", - "name": "permute_111", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - 
], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_111", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wk", - "name": "alias_default_289", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_287", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_289", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention.wk", - "name": "einsum_default_71", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_94", - "src_placement": "S(0)S(0)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wv", - "name": "dtype_cast_94", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_94", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention.wv", - "name": "permute_112", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_112", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wv", - "name": "alias_default_290", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_287", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_290", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wv", - "name": "einsum_default_72", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_70", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_256", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_71", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_257", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_72", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_258", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_256", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "convert_element_type_248", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_248", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_259", - "op": "aten.view.default", - "phase": "forward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_259", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_as_complex_20", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_257", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "convert_element_type_249", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "convert_element_type_249", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_260", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_260", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_as_complex_21", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_261", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_261", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "alias_default_291", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_20", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_291", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "mul_72", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_72", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_as_real_20", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - 
"code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_20", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_262", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_21", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_291", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "mul_73", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_73", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_as_real_21", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_21", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_263", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_262", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "convert_element_type_250", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": 
"convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_263", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "convert_element_type_251", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_251", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "unsqueeze_20", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_20", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "expand_20", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_20", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "clone_20", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_20", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_264", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_258", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "unsqueeze_21", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - 
"source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_21", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "expand_21", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_21", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "clone_21", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_21", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_265", - "op": "aten.view.default", - "phase": "forward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_250", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "permute_113", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_264", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "permute_114", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_265", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.10.attention", - "name": "permute_115", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_113", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "alias_default_292", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_114", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "alias_default_293", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_115", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "alias_default_294", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_292", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_293", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_294", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_10", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_10", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.sdpa", - "name": "getitem_90", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_10", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.sdpa", - "name": "getitem_91", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_10", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.sdpa", - "name": "getitem_96", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_10", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.10.attention.sdpa", - "name": "getitem_97", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_90", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.sdpa", - "name": "alias_default_295", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_295", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "permute_116", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_116", - "src_placement": 
"S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_266", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_95", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wo", - "name": "dtype_cast_95", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_95", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention.wo", - "name": "permute_117", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "view_266", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "alias_default_296", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_117", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wo", - "name": "alias_default_297", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_296", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_297", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wo", - "name": "einsum_default_73", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_283", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_73", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10", - "name": "add_51", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_100", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "dtype_cast_96", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_51", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10", - "name": "alias_default_298", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = 
x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_298", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "convert_element_type_254", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_254", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "alias_default_300", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_300", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "pow_22", - "op": 
"aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_22", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "mean_21", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "add_52", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_52", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "rsqrt_21", - "op": 
"aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "alias_default_301", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_300", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_301", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "mul_74", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - 
"dst_placement": "RR", - "name": "dtype_cast_96", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "alias_default_299", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_74", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_299", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "mul_75", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_75", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "convert_element_type_255", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": 
"rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_96", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w1", - "name": "dtype_cast_97", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_97", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w1", - "name": "permute_118", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_255", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "alias_default_302", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_118", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w1", - "name": "alias_default_303", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_302", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_303", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w1", - "name": "einsum_default_74", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_74", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - 
} - ], - "module_path": "L['self'].layers.10.feed_forward.w1", - "name": "alias_default_304", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_304", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "convert_element_type_258", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_258", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "alias_default_305", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - 
"dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_305", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "neg_10", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "exp_10", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "add_53", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - 
"cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_305", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "div_10", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "convert_element_type_259", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_98", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w3", - "name": "dtype_cast_98", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - 
"shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_98", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w3", - "name": "permute_119", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_119", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w3", - "name": "alias_default_307", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_302", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": 
"alias_default_307", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w3", - "name": "einsum_default_75", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_259", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "alias_default_306", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_75", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w3", - "name": "alias_default_308", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - 
"compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_306", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_308", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "mul_76", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_97", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w2", - "name": "dtype_cast_99", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_99", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w2", - "name": "permute_120", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_76", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "alias_default_309", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_120", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w2", - "name": "alias_default_310", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_309", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_310", - "src_placement": "RS(0)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w2", - "name": "einsum_default_76", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_298", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_76", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10", - "name": "add_54", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_108", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "dtype_cast_100", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_54", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10", - "name": "alias_default_311", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_311", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "convert_element_type_264", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_264", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "alias_default_313", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_313", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "pow_23", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_23", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "mean_22", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_22", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "add_55", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_55", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "rsqrt_22", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_22", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "alias_default_314", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_313", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_314", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "mul_77", 
- "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_100", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "alias_default_312", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_77", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_312", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "mul_78", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_78", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "convert_element_type_265", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_101", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wq", - "name": "dtype_cast_101", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_101", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention.wq", - "name": "permute_121", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 
14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_265", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "alias_default_315", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_121", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wq", - "name": "alias_default_316", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_315", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_316", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wq", - "name": "einsum_default_77", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - 
"source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_102", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wk", - "name": "dtype_cast_102", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_102", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention.wk", - "name": "permute_122", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_122", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wk", - "name": "alias_default_317", - "op": "aten.alias.default", - 
"phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_315", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_317", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention.wk", - "name": "einsum_default_78", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_103", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wv", - "name": "dtype_cast_103", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_103", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention.wv", - "name": "permute_123", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_123", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wv", - "name": "alias_default_318", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_315", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_318", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wv", - "name": "einsum_default_79", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_77", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_281", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_78", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_282", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_79", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_283", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = 
xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_281", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "convert_element_type_272", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_272", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_284", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_284", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.11.attention", - "name": "view_as_complex_22", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_282", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "convert_element_type_273", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_273", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_285", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - 
"cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_285", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_as_complex_23", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_286", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_286", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "alias_default_319", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_22", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_319", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "mul_79", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_79", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_as_real_22", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_22", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_287", - "op": "aten.view.default", - 
"phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_319", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "mul_80", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_80", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_as_real_23", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": 
"float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_288", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_287", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "convert_element_type_274", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_288", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "convert_element_type_275", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_275", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "unsqueeze_22", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_22", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "expand_22", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_22", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "clone_22", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - 
], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_22", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_289", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_283", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "unsqueeze_23", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "expand_23", - "op": "aten.expand.default", - "phase": "forward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "clone_23", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_290", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_274", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", 
- "name": "permute_124", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_289", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "permute_125", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_290", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "permute_126", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"permute_124", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "alias_default_320", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_125", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "alias_default_321", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_126", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "alias_default_322", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - 
"cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_320", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_321", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_322", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_11", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_11", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.sdpa", - "name": "getitem_99", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"_scaled_dot_product_flash_attention_11", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.sdpa", - "name": "getitem_100", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_11", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.sdpa", - "name": "getitem_105", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_11", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.sdpa", - "name": "getitem_106", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_99", - "src_placement": 
"S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.sdpa", - "name": "alias_default_323", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_323", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "permute_127", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_127", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_291", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(0)", - "name": "primals_104", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wo", - "name": "dtype_cast_104", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_104", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention.wo", - "name": "permute_128", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_291", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "alias_default_324", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_128", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wo", - "name": "alias_default_325", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_324", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_325", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wo", - "name": "einsum_default_80", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_311", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_80", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11", - "name": "add_56", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": 
{ - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_109", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "dtype_cast_105", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_56", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11", - "name": "alias_default_326", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_326", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "convert_element_type_278", - 
"op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_278", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "alias_default_328", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_328", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "pow_24", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.11.ffn_norm", - "name": "mean_23", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_23", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "add_57", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_57", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "rsqrt_23", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_23", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.11.ffn_norm", - "name": "alias_default_329", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_328", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_329", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "mul_81", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_105", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "alias_default_327", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 
52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_81", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_327", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "mul_82", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_82", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "convert_element_type_279", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_105", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w1", - "name": "dtype_cast_106", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_106", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w1", - "name": "permute_129", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_279", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "alias_default_330", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_129", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w1", - "name": "alias_default_331", - "op": "aten.alias.default", - "phase": "forward", - 
"placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_330", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_331", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w1", - "name": "einsum_default_81", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_81", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w1", - "name": "alias_default_332", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - 
{ - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_332", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "convert_element_type_282", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_282", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "alias_default_333", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_333", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "neg_11", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "exp_11", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "add_58", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_333", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "div_11", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": 
{ - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "convert_element_type_283", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_107", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w3", - "name": "dtype_cast_107", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_107", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.11.feed_forward.w3", - "name": "permute_130", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_130", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w3", - "name": "alias_default_335", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_330", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_335", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w3", - "name": "einsum_default_82", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": 
"alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_283", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "alias_default_334", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_82", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w3", - "name": "alias_default_336", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_334", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_336", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "mul_83", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_106", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w2", - "name": "dtype_cast_108", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_108", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w2", - "name": "permute_131", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_83", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "alias_default_337", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_131", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w2", - "name": "alias_default_338", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_337", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_338", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w2", - "name": "einsum_default_83", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_326", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_83", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11", - "name": "add_59", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_117", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "dtype_cast_109", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_59", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11", - "name": "alias_default_339", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_339", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "convert_element_type_288", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_288", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "alias_default_341", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_341", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "pow_25", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - 
"placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "mean_24", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "add_60", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_60", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "rsqrt_24", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "alias_default_342", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_341", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_342", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "mul_84", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_109", - 
"src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "alias_default_340", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_84", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_340", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "mul_85", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_85", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "convert_element_type_289", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_110", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wq", - "name": "dtype_cast_110", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_110", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention.wq", - "name": "permute_132", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_289", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "alias_default_343", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
- "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_132", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wq", - "name": "alias_default_344", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_343", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_344", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wq", - "name": "einsum_default_84", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_111", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wk", - "name": 
"dtype_cast_111", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_111", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention.wk", - "name": "permute_133", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_133", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wk", - "name": "alias_default_345", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_343", - "src_placement": "S(0)R", - 
"transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_345", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention.wk", - "name": "einsum_default_85", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_112", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wv", - "name": "dtype_cast_112", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_112", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention.wv", - "name": "permute_134", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_134", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wv", - "name": "alias_default_346", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_343", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_346", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wv", - "name": "einsum_default_86", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_84", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_306", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - 
"source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_85", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_307", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_86", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_308", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_306", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "convert_element_type_296", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_296", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_309", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_309", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_as_complex_24", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - 
"inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_307", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "convert_element_type_297", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_297", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_310", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_310", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_as_complex_25", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_311", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_311", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "alias_default_347", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_347", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "mul_86", - "op": "aten.mul.Tensor", 
- "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_86", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_as_real_24", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_312", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_25", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_347", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "mul_87", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_87", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_as_real_25", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_313", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - 
}, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_312", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "convert_element_type_298", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_313", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "convert_element_type_299", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_299", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "unsqueeze_24", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 
128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "expand_24", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "clone_24", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_314", - "op": "aten.view.default", - "phase": "forward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_308", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "unsqueeze_25", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "expand_25", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - 
"name": "clone_25", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_315", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_298", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "permute_135", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_314", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "permute_136", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_315", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "permute_137", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_135", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "alias_default_348", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_136", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "alias_default_349", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_137", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "alias_default_350", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_348", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_349", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_350", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.12.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_12", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.sdpa", - "name": "getitem_108", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.sdpa", - "name": "getitem_109", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_12", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.sdpa", - "name": "getitem_114", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_12", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.sdpa", - "name": "getitem_115", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_108", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.sdpa", - "name": "alias_default_351", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_351", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "permute_138", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_138", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_316", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_113", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wo", - "name": "dtype_cast_113", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_113", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention.wo", - "name": "permute_139", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_316", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "alias_default_352", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_139", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wo", - "name": "alias_default_353", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": 
"einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_352", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_353", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wo", - "name": "einsum_default_87", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_339", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_87", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12", - "name": "add_61", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_118", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "dtype_cast_114", - "op": 
"autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_61", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12", - "name": "alias_default_354", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_354", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "convert_element_type_302", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_302", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "alias_default_356", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_356", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "pow_26", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "mean_25", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "mean_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "add_62", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_62", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "rsqrt_25", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "alias_default_357", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "alias_default_356", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_357", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "mul_88", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_114", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "alias_default_355", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_88", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_355", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "mul_89", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_89", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "convert_element_type_303", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_114", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w1", - "name": "dtype_cast_115", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_115", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w1", - "name": "permute_140", - "op": "aten.permute.default", - "phase": "forward", - 
"placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_303", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "alias_default_358", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_140", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w1", - "name": "alias_default_359", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_358", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RS(1)", - "name": "alias_default_359", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w1", - "name": "einsum_default_88", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_88", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w1", - "name": "alias_default_360", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_360", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "convert_element_type_306", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_306", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "alias_default_361", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_361", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "neg_12", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "exp_12", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "add_63", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_361", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_63", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "div_12", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - 
"name": "convert_element_type_307", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_116", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w3", - "name": "dtype_cast_116", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_116", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w3", - "name": "permute_141", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": 
"permute_141", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w3", - "name": "alias_default_363", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_358", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_363", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w3", - "name": "einsum_default_89", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_307", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "alias_default_362", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_89", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w3", - "name": "alias_default_364", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_362", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_364", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "mul_90", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_115", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.12.feed_forward.w2", - "name": "dtype_cast_117", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_117", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w2", - "name": "permute_142", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_90", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "alias_default_365", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - 
"name": "permute_142", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w2", - "name": "alias_default_366", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_365", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_366", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w2", - "name": "einsum_default_90", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_354", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_90", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12", - "name": "add_64", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - 
"source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_126", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "dtype_cast_118", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_64", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12", - "name": "alias_default_367", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_367", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "convert_element_type_312", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_312", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "alias_default_369", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_369", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "pow_27", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.13.attention_norm", - "name": "mean_26", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "add_65", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_65", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "rsqrt_26", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.13.attention_norm", - "name": "alias_default_370", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_369", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_370", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "mul_91", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_118", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "alias_default_368", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 
52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_91", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_368", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "mul_92", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_92", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "convert_element_type_313", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_119", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wq", - "name": "dtype_cast_119", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_119", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention.wq", - "name": "permute_143", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_313", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "alias_default_371", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_143", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wq", - "name": "alias_default_372", - "op": "aten.alias.default", - "phase": 
"forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_371", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_372", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wq", - "name": "einsum_default_91", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_120", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wk", - "name": "dtype_cast_120", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_120", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention.wk", - "name": "permute_144", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_144", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wk", - "name": "alias_default_373", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_371", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_373", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention.wk", - "name": "einsum_default_92", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_121", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wv", - "name": "dtype_cast_121", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_121", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention.wv", - "name": "permute_145", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_145", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wv", - "name": "alias_default_374", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], 
- "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_371", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_374", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wv", - "name": "einsum_default_93", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_91", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_331", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_92", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_332", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_93", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_333", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_331", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "convert_element_type_320", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": 
"float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_320", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_334", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_334", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_as_complex_26", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_332", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "convert_element_type_321", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_321", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_335", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_335", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_as_complex_27", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_336", - "op": "aten.view.default", - "phase": "forward", - 
"placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_336", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "alias_default_375", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_375", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "mul_93", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"mul_93", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_as_real_26", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_337", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_27", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_375", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "mul_94", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_94", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_as_real_27", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_27", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_338", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_337", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "convert_element_type_322", - "op": "prims.convert_element_type.default", - 
"phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_338", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "convert_element_type_323", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_323", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "unsqueeze_26", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_26", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "expand_26", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "clone_26", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_339", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "view_333", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "unsqueeze_27", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_27", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "expand_27", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_27", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "clone_27", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_27", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_340", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_322", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "permute_146", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_339", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "permute_147", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 
- }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_340", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "permute_148", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_146", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "alias_default_376", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_147", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "alias_default_377", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, 
seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_148", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "alias_default_378", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_376", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_377", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_378", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_13", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", 
- "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_13", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.sdpa", - "name": "getitem_117", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_13", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.sdpa", - "name": "getitem_118", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_13", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.sdpa", - "name": "getitem_123", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - 
"line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_13", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.sdpa", - "name": "getitem_124", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_117", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.sdpa", - "name": "alias_default_379", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_379", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "permute_149", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_149", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_341", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_122", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wo", - "name": "dtype_cast_122", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_122", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention.wo", - "name": "permute_150", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_341", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "alias_default_380", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_150", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wo", - "name": "alias_default_381", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_380", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_381", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wo", - "name": "einsum_default_94", - "op": 
"aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_367", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_94", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13", - "name": "add_66", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_127", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "dtype_cast_123", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "add_66", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13", - "name": "alias_default_382", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_382", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "convert_element_type_326", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_326", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "alias_default_384", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - 
"cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_384", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "pow_28", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_28", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "mean_27", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "add_67", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, 
- "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_67", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "rsqrt_27", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "alias_default_385", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_384", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_385", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "mul_95", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - 
"file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_123", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "alias_default_383", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_95", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_383", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "mul_96", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_96", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "convert_element_type_327", - 
"op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_123", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w1", - "name": "dtype_cast_124", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_124", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w1", - "name": "permute_151", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_327", - 
"src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "alias_default_386", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_151", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w1", - "name": "alias_default_387", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_386", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_387", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w1", - "name": "einsum_default_95", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_95", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w1", - "name": "alias_default_388", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_388", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "convert_element_type_330", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_330", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "alias_default_389", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_389", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "neg_13", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "exp_13", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "add_68", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ 
- 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_389", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_68", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "div_13", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "convert_element_type_331", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(0)", - "name": "primals_125", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w3", - "name": "dtype_cast_125", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_125", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w3", - "name": "permute_152", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_152", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w3", - "name": "alias_default_391", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": 
"einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_386", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_391", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w3", - "name": "einsum_default_96", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_331", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "alias_default_390", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_96", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w3", - "name": "alias_default_392", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 
- ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_390", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_392", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "mul_97", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_124", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w2", - "name": "dtype_cast_126", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": 
"dtype_cast_126", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w2", - "name": "permute_153", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_97", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "alias_default_393", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_153", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w2", - "name": "alias_default_394", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_393", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_394", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w2", - "name": "einsum_default_97", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_382", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_97", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13", - "name": "add_69", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_135", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "dtype_cast_127", - "op": "autoparallel.dtype_cast.default", - 
"phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_69", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13", - "name": "alias_default_395", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_395", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "convert_element_type_336", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_336", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.14.attention_norm", - "name": "alias_default_397", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_397", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "pow_29", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_29", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "mean_28", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_28", - "src_placement": "S(0)S(1)", 
- "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "add_70", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_70", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "rsqrt_28", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_28", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "alias_default_398", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_397", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_398", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "mul_98", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_127", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "alias_default_396", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_98", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_396", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "mul_99", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_99", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "convert_element_type_337", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_128", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wq", - "name": "dtype_cast_128", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_128", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention.wq", - "name": "permute_154", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 
4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_337", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "alias_default_399", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_154", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wq", - "name": "alias_default_400", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_399", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": 
"alias_default_400", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wq", - "name": "einsum_default_98", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_129", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wk", - "name": "dtype_cast_129", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_129", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention.wk", - "name": "permute_155", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_155", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wk", - "name": "alias_default_401", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_399", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_401", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention.wk", - "name": "einsum_default_99", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_130", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wv", - "name": "dtype_cast_130", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_130", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention.wv", - "name": "permute_156", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_156", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wv", - "name": "alias_default_402", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_399", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_402", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wv", - 
"name": "einsum_default_100", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_98", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_356", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_99", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_357", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_100", - "src_placement": "S(0)S(2)", - "transition_cost": 0 
- } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_358", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_356", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "convert_element_type_344", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_344", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_359", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - 
"compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_359", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_as_complex_28", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_357", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "convert_element_type_345", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_345", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_360", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], 
-1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_360", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_as_complex_29", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_361", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_361", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "alias_default_403", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 
1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_403", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "mul_100", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_100", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_as_real_28", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"view_as_real_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_362", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_403", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "mul_101", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_101", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_as_real_29", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_363", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_362", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "convert_element_type_346", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_363", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "convert_element_type_347", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_347", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "unsqueeze_28", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "expand_28", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_28", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "clone_28", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_364", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_358", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "unsqueeze_29", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - 
"name": "unsqueeze_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "expand_29", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "clone_29", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_365", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_346", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "permute_157", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_364", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "permute_158", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_365", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "permute_159", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - 
"line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_157", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "alias_default_404", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_158", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "alias_default_405", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_159", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "alias_default_406", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = 
values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_404", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_405", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_406", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_14", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.sdpa", - "name": "getitem_126", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.sdpa", - "name": "getitem_127", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_14", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.sdpa", - "name": "getitem_132", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_14", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.sdpa", - "name": "getitem_133", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - 
"line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_126", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.sdpa", - "name": "alias_default_407", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_407", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "permute_160", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_160", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_366", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_131", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wo", - "name": "dtype_cast_131", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_131", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention.wo", - "name": "permute_161", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_366", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "alias_default_408", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - 
"code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_161", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wo", - "name": "alias_default_409", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_408", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_409", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wo", - "name": "einsum_default_101", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_395", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "einsum_default_101", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14", - "name": "add_71", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_136", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "dtype_cast_132", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_71", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14", - "name": "alias_default_410", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_410", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "convert_element_type_350", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_350", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "alias_default_412", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_412", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "pow_30", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - 
"line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_30", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "mean_29", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_29", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "add_72", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_72", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "rsqrt_29", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_29", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "alias_default_413", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_412", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_413", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "mul_102", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_132", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "alias_default_411", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - 
"code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_102", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_411", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "mul_103", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_103", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "convert_element_type_351", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_132", - "src_placement": "S(0)S(0)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w1", - "name": "dtype_cast_133", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_133", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w1", - "name": "permute_162", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_351", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "alias_default_414", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_162", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w1", - "name": "alias_default_415", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_414", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_415", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w1", - "name": "einsum_default_102", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_102", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w1", - "name": "alias_default_416", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_416", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "convert_element_type_354", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_354", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "alias_default_417", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_417", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "neg_14", - "op": "aten.neg.default", - "phase": 
"forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_14", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "exp_14", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_14", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "add_73", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_417", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - 
"name": "add_73", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "div_14", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_14", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "convert_element_type_355", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_134", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w3", - "name": "dtype_cast_134", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - 
"cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_134", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w3", - "name": "permute_163", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_163", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w3", - "name": "alias_default_419", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_414", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_419", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w3", - "name": "einsum_default_103", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": 
"return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_355", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "alias_default_418", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_103", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w3", - "name": "alias_default_420", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_418", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_420", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "mul_104", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_133", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w2", - "name": "dtype_cast_135", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_135", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w2", - "name": "permute_164", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_104", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "alias_default_421", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_164", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w2", - "name": "alias_default_422", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_421", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_422", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w2", - "name": "einsum_default_104", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_410", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_104", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14", - "name": "add_74", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_144", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "dtype_cast_136", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_74", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14", - 
"name": "alias_default_423", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_423", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "convert_element_type_360", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_360", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "alias_default_425", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "alias_default_425", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "pow_31", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "mean_30", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_30", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "add_75", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "add_75", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "rsqrt_30", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_30", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "alias_default_426", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_425", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_426", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "mul_105", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - 
}, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_136", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "alias_default_424", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_105", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_424", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "mul_106", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_106", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "convert_element_type_361", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 
8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_137", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wq", - "name": "dtype_cast_137", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_137", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention.wq", - "name": "permute_165", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_361", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": 
"alias_default_427", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_165", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wq", - "name": "alias_default_428", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_427", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_428", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wq", - "name": "einsum_default_105", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - 
"inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_138", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wk", - "name": "dtype_cast_138", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_138", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention.wk", - "name": "permute_166", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_166", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wk", - "name": "alias_default_429", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - 
"cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_427", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_429", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention.wk", - "name": "einsum_default_106", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_139", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wv", - "name": "dtype_cast_139", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_139", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention.wv", - "name": "permute_167", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - 
"source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_167", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wv", - "name": "alias_default_430", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_427", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_430", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wv", - "name": "einsum_default_107", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_105", - "src_placement": "S(0)S(2)", 
- "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_381", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_106", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_382", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_107", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_383", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "view_381", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "convert_element_type_368", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_368", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_384", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_384", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_as_complex_30", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - 
"func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_382", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "convert_element_type_369", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_369", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_385", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_385", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_as_complex_31", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_386", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_386", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "alias_default_431", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_30", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": 
"alias_default_431", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "mul_107", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_107", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_as_real_30", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_30", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_387", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - 
"compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_431", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "mul_108", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_108", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_as_real_31", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_388", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = 
torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_387", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "convert_element_type_370", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_388", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "convert_element_type_371", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_371", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.15.attention", - "name": "unsqueeze_30", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_30", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "expand_30", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_30", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "clone_30", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_30", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_389", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_383", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "unsqueeze_31", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "expand_31", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "clone_31", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_390", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_370", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "permute_168", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - 
"cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_389", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "permute_169", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_390", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "permute_170", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_168", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "alias_default_432", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_169", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "alias_default_433", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_170", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "alias_default_434", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_432", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_433", - "src_placement": 
"S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_434", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_15", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.sdpa", - "name": "getitem_135", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.sdpa", - "name": "getitem_136", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_15", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.sdpa", - "name": "getitem_141", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_15", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.sdpa", - "name": "getitem_142", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_135", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.sdpa", - "name": "alias_default_435", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_435", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "permute_171", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_171", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_391", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_140", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wo", - "name": "dtype_cast_140", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": 
"return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_140", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention.wo", - "name": "permute_172", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_391", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "alias_default_436", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_172", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wo", - "name": "alias_default_437", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - 
], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_436", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_437", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wo", - "name": "einsum_default_108", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_423", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_108", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15", - "name": "add_76", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - 
{ - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_145", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "dtype_cast_141", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_76", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15", - "name": "alias_default_438", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_438", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "convert_element_type_374", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - 
{ - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_374", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "alias_default_440", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_440", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "pow_32", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_32", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "mean_31", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - 
"line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "add_77", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_77", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "rsqrt_31", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "alias_default_441", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - 
}, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_440", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_441", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "mul_109", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_141", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "alias_default_439", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_109", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_439", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - 
"name": "mul_110", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_110", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "convert_element_type_375", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_141", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w1", - "name": "dtype_cast_142", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": 
"dtype_cast_142", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w1", - "name": "permute_173", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_375", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "alias_default_442", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_173", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w1", - "name": "alias_default_443", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, 
- "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_442", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_443", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w1", - "name": "einsum_default_109", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_109", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w1", - "name": "alias_default_444", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_444", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "convert_element_type_378", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { 
- "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_378", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "alias_default_445", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_445", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "neg_15", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "exp_15", - "op": "aten.exp.default", - "phase": 
"forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "add_78", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_445", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_78", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "div_15", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "div_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "convert_element_type_379", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_143", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w3", - "name": "dtype_cast_143", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_143", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w3", - "name": "permute_174", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - 
"cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_174", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w3", - "name": "alias_default_447", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_442", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_447", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w3", - "name": "einsum_default_110", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_379", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "alias_default_446", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, 
- 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_110", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w3", - "name": "alias_default_448", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_446", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_448", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "mul_111", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "primals_142", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w2", - "name": "dtype_cast_144", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_144", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w2", - "name": "permute_175", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_111", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "alias_default_449", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_175", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w2", - "name": "alias_default_450", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_449", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_450", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w2", - "name": "einsum_default_111", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_438", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_111", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15", - "name": "add_79", - "op": 
"aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_153", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "dtype_cast_145", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_79", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15", - "name": "alias_default_451", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_451", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.16.attention_norm", - "name": "convert_element_type_384", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_384", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "alias_default_453", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_453", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "pow_33", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "pow_33", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "mean_32", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_32", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "add_80", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_80", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "rsqrt_32", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)S(1)", - "name": "rsqrt_32", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "alias_default_454", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_453", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_454", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "mul_112", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_145", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "alias_default_452", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - 
"line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_112", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_452", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "mul_113", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_113", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "convert_element_type_385", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_146", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wq", - "name": "dtype_cast_146", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - 
"placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_146", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention.wq", - "name": "permute_176", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_385", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "alias_default_455", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_176", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.16.attention.wq", - "name": "alias_default_456", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_455", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_456", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wq", - "name": "einsum_default_112", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_147", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wk", - "name": "dtype_cast_147", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - 
"cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_147", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention.wk", - "name": "permute_177", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_177", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wk", - "name": "alias_default_457", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_455", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_457", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention.wk", - "name": "einsum_default_113", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = 
self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_148", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wv", - "name": "dtype_cast_148", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_148", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention.wv", - "name": "permute_178", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_178", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wv", - "name": "alias_default_458", - "op": "aten.alias.default", - "phase": "forward", - 
"placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_455", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_458", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wv", - "name": "einsum_default_114", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_112", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_406", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - 
"name": "einsum_default_113", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_407", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_114", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_408", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_406", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "convert_element_type_392", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - 
"cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_392", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_409", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_409", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_as_complex_32", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_407", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "convert_element_type_393", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = 
torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_393", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_410", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_410", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_as_complex_33", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.16.attention", - "name": "view_411", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_411", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "alias_default_459", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_32", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_459", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "mul_114", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 
0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_114", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_as_real_32", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_32", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_412", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_33", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_459", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "mul_115", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * 
freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_115", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_as_real_33", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_33", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_413", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_412", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "convert_element_type_394", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_413", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "convert_element_type_395", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_395", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "unsqueeze_32", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"unsqueeze_32", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "expand_32", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_32", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "clone_32", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_32", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_414", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", 
- "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_408", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "unsqueeze_33", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_33", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "expand_33", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_33", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "clone_33", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - 
"cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_33", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_415", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_394", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "permute_179", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_414", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "permute_180", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_415", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "permute_181", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_179", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "alias_default_460", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_180", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "alias_default_461", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - 
"shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_181", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "alias_default_462", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_460", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_461", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_462", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_16", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, 
is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.sdpa", - "name": "getitem_144", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.sdpa", - "name": "getitem_145", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_16", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.sdpa", - "name": "getitem_150", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, 
is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_16", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.sdpa", - "name": "getitem_151", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_144", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.sdpa", - "name": "alias_default_463", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_463", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "permute_182", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": 
"output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_182", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_416", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_149", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wo", - "name": "dtype_cast_149", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_149", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention.wo", - "name": "permute_183", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - 
"shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_416", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "alias_default_464", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_183", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wo", - "name": "alias_default_465", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_464", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_465", - "src_placement": "RR", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.16.attention.wo", - "name": "einsum_default_115", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_451", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_115", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16", - "name": "add_81", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_154", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "dtype_cast_150", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": 
"alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_81", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16", - "name": "alias_default_466", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_466", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "convert_element_type_398", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_398", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "alias_default_468", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_468", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "pow_34", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_34", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "mean_33", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_33", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "add_82", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - 
"file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_82", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "rsqrt_33", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_33", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "alias_default_469", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_468", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_469", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "mul_116", - "op": "aten.mul.Tensor", - "phase": 
"forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_150", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "alias_default_467", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_116", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_467", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "mul_117", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "mul_117", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "convert_element_type_399", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_150", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w1", - "name": "dtype_cast_151", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_151", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w1", - "name": "permute_184", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_399", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "alias_default_470", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_184", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w1", - "name": "alias_default_471", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_470", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_471", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w1", - "name": "einsum_default_116", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_116", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w1", - "name": "alias_default_472", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_472", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "convert_element_type_402", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_402", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - 
"name": "alias_default_473", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_473", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "neg_16", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_16", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "exp_16", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_16", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "add_83", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_473", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_83", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "div_16", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_16", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "convert_element_type_403", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_152", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w3", - "name": "dtype_cast_152", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_152", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w3", - "name": "permute_185", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_185", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w3", - "name": "alias_default_475", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_470", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_475", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w3", - "name": "einsum_default_117", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_403", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "alias_default_474", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_117", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.16.feed_forward.w3", - "name": "alias_default_476", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_474", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_476", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "mul_118", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_151", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w2", - "name": "dtype_cast_153", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - 
}, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_153", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w2", - "name": "permute_186", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_118", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "alias_default_477", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_186", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w2", - "name": "alias_default_478", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_477", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_478", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w2", - "name": "einsum_default_118", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_466", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_118", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16", - "name": "add_84", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(0)", - "name": "primals_162", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "dtype_cast_154", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_84", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16", - "name": "alias_default_479", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_479", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "convert_element_type_408", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - 
"cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_408", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "alias_default_481", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_481", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "pow_35", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_35", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "mean_34", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - 
}, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_34", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "add_85", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_85", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "rsqrt_34", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_34", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "alias_default_482", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_481", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_482", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "mul_119", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_154", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "alias_default_480", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_119", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_480", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.17.attention_norm", - "name": "mul_120", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_120", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "convert_element_type_409", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_155", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wq", - "name": "dtype_cast_155", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": 
"RS(0)", - "name": "dtype_cast_155", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention.wq", - "name": "permute_187", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_409", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "alias_default_483", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_187", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wq", - "name": "alias_default_484", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 
198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_483", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_484", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wq", - "name": "einsum_default_119", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_156", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wk", - "name": "dtype_cast_156", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_156", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention.wk", - "name": "permute_188", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_188", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wk", - "name": "alias_default_485", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_483", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_485", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention.wk", - "name": "einsum_default_120", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_157", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.17.attention.wv", - "name": "dtype_cast_157", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_157", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention.wv", - "name": "permute_189", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_189", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wv", - "name": "alias_default_486", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - 
"name": "alias_default_483", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_486", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wv", - "name": "einsum_default_121", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_119", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_431", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_120", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_432", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - 
}, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_121", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_433", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_431", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "convert_element_type_416", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_416", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_434", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = 
torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_434", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_as_complex_34", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_432", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "convert_element_type_417", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_417", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_435", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_435", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_as_complex_35", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_436", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": 
"complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_436", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "alias_default_487", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_487", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "mul_121", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_121", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_as_real_34", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_437", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_487", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "mul_122", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_122", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.17.attention", - "name": "view_as_real_35", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_438", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_437", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "convert_element_type_418", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 
9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_438", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "convert_element_type_419", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_419", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "unsqueeze_34", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "expand_34", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - 
"line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "clone_34", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_439", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_433", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "unsqueeze_35", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "expand_35", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "clone_35", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_440", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - 
"source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_418", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "permute_190", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_439", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "permute_191", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_440", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "permute_192", - "op": 
"aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_190", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "alias_default_488", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_191", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "alias_default_489", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"permute_192", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "alias_default_490", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_488", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_489", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_490", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_17", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_17", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.17.attention.sdpa", - "name": "getitem_153", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_17", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.sdpa", - "name": "getitem_154", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_17", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.sdpa", - "name": "getitem_159", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_17", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.sdpa", - "name": "getitem_160", - 
"op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_153", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.sdpa", - "name": "alias_default_491", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_491", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "permute_193", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_193", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.17.attention", - "name": "view_441", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_158", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wo", - "name": "dtype_cast_158", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_158", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention.wo", - "name": "permute_194", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_441", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "alias_default_492", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_194", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wo", - "name": "alias_default_493", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_492", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_493", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wo", - "name": "einsum_default_122", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 
39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_479", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_122", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17", - "name": "add_86", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_163", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "dtype_cast_159", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_86", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17", - "name": "alias_default_494", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_494", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "convert_element_type_422", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_422", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "alias_default_496", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_496", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "pow_36", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_36", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "mean_35", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_35", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "add_87", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_87", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "rsqrt_35", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": 
[ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_35", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "alias_default_497", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_496", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_497", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "mul_123", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_159", - "src_placement": "S(0)S(0)", - 
"transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "alias_default_495", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_123", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_495", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "mul_124", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_124", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "convert_element_type_423", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - 
"cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_159", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w1", - "name": "dtype_cast_160", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_160", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w1", - "name": "permute_195", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_423", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "alias_default_498", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_195", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w1", - "name": "alias_default_499", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_498", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_499", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w1", - "name": "einsum_default_123", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_123", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w1", 
- "name": "alias_default_500", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_500", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "convert_element_type_426", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_426", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "alias_default_501", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "alias_default_501", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "neg_17", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "exp_17", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "add_88", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - 
"dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_501", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_88", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "div_17", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "convert_element_type_427", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_161", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w3", - "name": "dtype_cast_161", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_161", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w3", - "name": "permute_196", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_196", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w3", - "name": "alias_default_503", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_498", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_503", - "src_placement": "RS(1)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.17.feed_forward.w3", - "name": "einsum_default_124", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_427", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "alias_default_502", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_124", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w3", - "name": "alias_default_504", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_502", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_504", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "mul_125", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_160", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w2", - "name": "dtype_cast_162", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_162", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w2", - "name": "permute_197", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_125", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "alias_default_505", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_197", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w2", - "name": "alias_default_506", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_505", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_506", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.17.feed_forward.w2", - "name": "einsum_default_125", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_494", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_125", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17", - "name": "add_89", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_171", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "dtype_cast_163", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - 
"cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_89", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17", - "name": "alias_default_507", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_507", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "convert_element_type_432", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_432", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "alias_default_509", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_509", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "pow_37", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_37", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "mean_36", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_36", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "add_90", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_90", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "rsqrt_36", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_36", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "alias_default_510", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_509", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_510", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "mul_126", - "op": 
"aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_163", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "alias_default_508", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_126", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_508", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "mul_127", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_127", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "convert_element_type_433", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_164", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wq", - "name": "dtype_cast_164", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_164", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention.wq", - "name": "permute_198", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - 
"cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_433", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "alias_default_511", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_198", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wq", - "name": "alias_default_512", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_511", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_512", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wq", - "name": "einsum_default_126", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": 
{ - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_165", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wk", - "name": "dtype_cast_165", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_165", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention.wk", - "name": "permute_199", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_199", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wk", - "name": "alias_default_513", - "op": "aten.alias.default", - "phase": 
"forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_511", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_513", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention.wk", - "name": "einsum_default_127", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_166", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wv", - "name": "dtype_cast_166", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_166", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention.wv", - "name": "permute_200", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_200", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wv", - "name": "alias_default_514", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_511", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_514", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wv", - "name": "einsum_default_128", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_126", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_456", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_127", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_457", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_128", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_458", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = 
xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_456", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "convert_element_type_440", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_440", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_459", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_459", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.18.attention", - "name": "view_as_complex_36", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_457", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "convert_element_type_441", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_441", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_460", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - 
"cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_460", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_as_complex_37", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_461", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_461", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "alias_default_515", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_36", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_515", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "mul_128", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_128", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_as_real_36", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_36", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_462", - "op": "aten.view.default", 
- "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_515", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "mul_129", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_129", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_as_real_37", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": 
"float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_463", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_462", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "convert_element_type_442", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_463", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "convert_element_type_443", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_443", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "unsqueeze_36", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_36", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "expand_36", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_36", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "clone_36", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - 
], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_36", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_464", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_458", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "unsqueeze_37", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "expand_37", - "op": "aten.expand.default", - "phase": "forward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "clone_37", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_465", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_442", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", 
- "name": "permute_201", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_464", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "permute_202", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_465", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "permute_203", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"permute_201", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "alias_default_516", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_202", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "alias_default_517", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_203", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "alias_default_518", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - 
"cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_516", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_517", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_518", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_18", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.sdpa", - "name": "getitem_162", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"_scaled_dot_product_flash_attention_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.sdpa", - "name": "getitem_163", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_18", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.sdpa", - "name": "getitem_168", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_18", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.sdpa", - "name": "getitem_169", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_162", - "src_placement": 
"S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.sdpa", - "name": "alias_default_519", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_519", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "permute_204", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_204", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_466", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(0)", - "name": "primals_167", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wo", - "name": "dtype_cast_167", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_167", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention.wo", - "name": "permute_205", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_466", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "alias_default_520", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_205", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wo", - "name": "alias_default_521", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_520", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_521", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wo", - "name": "einsum_default_129", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_507", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_129", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18", - "name": "add_91", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - 
"source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_172", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "dtype_cast_168", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_91", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18", - "name": "alias_default_522", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_522", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": 
"convert_element_type_446", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_446", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "alias_default_524", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_524", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "pow_38", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_38", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "mean_37", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_37", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "add_92", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_92", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "rsqrt_37", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_37", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.18.ffn_norm", - "name": "alias_default_525", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_524", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_525", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "mul_130", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_168", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "alias_default_523", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - 
"compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_130", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_523", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "mul_131", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_131", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "convert_element_type_447", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_168", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w1", - "name": "dtype_cast_169", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_169", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w1", - "name": "permute_206", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_447", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "alias_default_526", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_206", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w1", - "name": "alias_default_527", - "op": "aten.alias.default", - "phase": "forward", - 
"placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_526", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_527", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w1", - "name": "einsum_default_130", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_130", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w1", - "name": "alias_default_528", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_528", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "convert_element_type_450", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_450", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "alias_default_529", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_529", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "neg_18", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "exp_18", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "add_93", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_529", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_93", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "div_18", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": 
{ - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "convert_element_type_451", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_170", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w3", - "name": "dtype_cast_170", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_170", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.18.feed_forward.w3", - "name": "permute_207", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_207", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w3", - "name": "alias_default_531", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_526", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_531", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w3", - "name": "einsum_default_131", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": 
"alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_451", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "alias_default_530", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_131", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w3", - "name": "alias_default_532", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_530", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_532", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "mul_132", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_169", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w2", - "name": "dtype_cast_171", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_171", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w2", - "name": "permute_208", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_132", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "alias_default_533", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_208", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w2", - "name": "alias_default_534", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_533", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_534", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w2", - "name": "einsum_default_132", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_522", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_132", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18", - "name": "add_94", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_180", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "dtype_cast_172", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_94", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18", - "name": "alias_default_535", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_535", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "convert_element_type_456", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_456", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "alias_default_537", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_537", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "pow_39", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - 
"placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_39", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "mean_38", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_38", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "add_95", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_95", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "rsqrt_38", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_38", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "alias_default_538", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_537", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_538", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "mul_133", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_172", - 
"src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "alias_default_536", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_133", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_536", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "mul_134", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_134", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "convert_element_type_457", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_173", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wq", - "name": "dtype_cast_173", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_173", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention.wq", - "name": "permute_209", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_457", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "alias_default_539", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
- "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_209", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wq", - "name": "alias_default_540", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_539", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_540", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wq", - "name": "einsum_default_133", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_174", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wk", - "name": 
"dtype_cast_174", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_174", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention.wk", - "name": "permute_210", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_210", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wk", - "name": "alias_default_541", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_539", - "src_placement": "S(0)R", - 
"transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_541", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention.wk", - "name": "einsum_default_134", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_175", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wv", - "name": "dtype_cast_175", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_175", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention.wv", - "name": "permute_211", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - 
}, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_211", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wv", - "name": "alias_default_542", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_539", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_542", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wv", - "name": "einsum_default_135", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_133", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_481", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - 
"source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_134", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_482", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_135", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_483", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_481", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "convert_element_type_464", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_464", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_484", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_484", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_as_complex_38", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - 
"inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_482", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "convert_element_type_465", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_465", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_485", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_485", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_as_complex_39", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_486", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_486", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "alias_default_543", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_38", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_543", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "mul_135", - "op": "aten.mul.Tensor", 
- "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_135", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_as_real_38", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_38", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_487", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_39", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_543", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "mul_136", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_136", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_as_real_39", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_39", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_488", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 
- }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_487", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "convert_element_type_466", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_488", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "convert_element_type_467", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_467", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "unsqueeze_38", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, 
- 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_38", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "expand_38", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_38", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "clone_38", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_38", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_489", - "op": "aten.view.default", - "phase": "forward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_483", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "unsqueeze_39", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_39", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "expand_39", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_39", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - 
"name": "clone_39", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_39", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_490", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_466", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "permute_212", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_489", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "permute_213", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_490", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "permute_214", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_212", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "alias_default_544", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_213", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "alias_default_545", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_214", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "alias_default_546", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_544", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_545", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_546", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.19.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_19", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.sdpa", - "name": "getitem_171", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.sdpa", - "name": "getitem_172", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_19", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.sdpa", - "name": "getitem_177", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_19", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.sdpa", - "name": "getitem_178", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_171", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.sdpa", - "name": "alias_default_547", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_547", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "permute_215", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_215", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_491", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_176", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wo", - "name": "dtype_cast_176", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_176", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention.wo", - "name": "permute_216", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_491", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "alias_default_548", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_216", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wo", - "name": "alias_default_549", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": 
"einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_548", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_549", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wo", - "name": "einsum_default_136", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_535", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_136", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19", - "name": "add_96", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_181", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "dtype_cast_177", - "op": 
"autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_96", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19", - "name": "alias_default_550", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_550", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "convert_element_type_470", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_470", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "alias_default_552", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_552", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "pow_40", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_40", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "mean_39", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "mean_39", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "add_97", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_97", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "rsqrt_39", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_39", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "alias_default_553", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "alias_default_552", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_553", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "mul_137", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_177", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "alias_default_551", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_137", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_551", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "mul_138", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_138", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "convert_element_type_471", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_177", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w1", - "name": "dtype_cast_178", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_178", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w1", - "name": "permute_217", - "op": "aten.permute.default", - "phase": "forward", - 
"placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_471", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "alias_default_554", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_217", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w1", - "name": "alias_default_555", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_554", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RS(1)", - "name": "alias_default_555", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w1", - "name": "einsum_default_137", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_137", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w1", - "name": "alias_default_556", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_556", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "convert_element_type_474", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_474", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "alias_default_557", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_557", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "neg_19", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "exp_19", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "add_98", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_557", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_98", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "div_19", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - 
"name": "convert_element_type_475", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_179", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w3", - "name": "dtype_cast_179", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_179", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w3", - "name": "permute_218", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": 
"permute_218", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w3", - "name": "alias_default_559", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_554", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_559", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w3", - "name": "einsum_default_138", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_475", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "alias_default_558", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_138", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w3", - "name": "alias_default_560", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_558", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_560", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "mul_139", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_178", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.19.feed_forward.w2", - "name": "dtype_cast_180", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_180", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w2", - "name": "permute_219", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_139", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "alias_default_561", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - 
"name": "permute_219", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w2", - "name": "alias_default_562", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_561", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_562", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w2", - "name": "einsum_default_139", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_550", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_139", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19", - "name": "add_99", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - 
"source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_189", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "dtype_cast_181", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_99", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19", - "name": "alias_default_563", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_563", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "convert_element_type_480", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_480", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "alias_default_565", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_565", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "pow_41", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_41", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.20.attention_norm", - "name": "mean_40", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_40", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "add_100", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_100", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "rsqrt_40", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_40", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.20.attention_norm", - "name": "alias_default_566", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_565", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_566", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "mul_140", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_181", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "alias_default_564", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 
52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_140", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_564", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "mul_141", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_141", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "convert_element_type_481", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_182", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wq", - "name": "dtype_cast_182", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_182", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention.wq", - "name": "permute_220", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_481", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "alias_default_567", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_220", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wq", - "name": "alias_default_568", - "op": "aten.alias.default", - "phase": 
"forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_567", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_568", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wq", - "name": "einsum_default_140", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_183", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wk", - "name": "dtype_cast_183", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_183", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention.wk", - "name": "permute_221", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_221", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wk", - "name": "alias_default_569", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_567", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_569", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention.wk", - "name": "einsum_default_141", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_184", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wv", - "name": "dtype_cast_184", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_184", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention.wv", - "name": "permute_222", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_222", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wv", - "name": "alias_default_570", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], 
- "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_567", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_570", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wv", - "name": "einsum_default_142", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_140", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_506", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_141", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_507", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_142", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_508", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_506", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "convert_element_type_488", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - 
"dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_488", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_509", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_509", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_as_complex_40", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_507", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "convert_element_type_489", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_489", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_510", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_510", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_as_complex_41", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_511", - "op": "aten.view.default", - "phase": 
"forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_511", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "alias_default_571", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_40", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_571", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "mul_142", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - 
"name": "mul_142", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_as_real_40", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_40", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_512", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_41", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_571", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "mul_143", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_143", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_as_real_41", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_41", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_513", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_512", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "convert_element_type_490", - "op": "prims.convert_element_type.default", - 
"phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_513", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "convert_element_type_491", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_491", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "unsqueeze_40", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_40", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "expand_40", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_40", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "clone_40", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_40", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_514", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "view_508", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "unsqueeze_41", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_41", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "expand_41", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_41", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "clone_41", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_41", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_515", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_490", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "permute_223", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_514", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "permute_224", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 
- }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_515", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "permute_225", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_223", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "alias_default_572", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_224", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "alias_default_573", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, 
seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_225", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "alias_default_574", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_572", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_573", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_574", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_20", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", 
- "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.sdpa", - "name": "getitem_180", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.sdpa", - "name": "getitem_181", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_20", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.sdpa", - "name": "getitem_186", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - 
"line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_20", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.sdpa", - "name": "getitem_187", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_180", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.sdpa", - "name": "alias_default_575", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_575", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "permute_226", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_226", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_516", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_185", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wo", - "name": "dtype_cast_185", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_185", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention.wo", - "name": "permute_227", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_516", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "alias_default_576", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_227", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wo", - "name": "alias_default_577", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_576", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_577", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wo", - "name": "einsum_default_143", - "op": 
"aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_563", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_143", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20", - "name": "add_101", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_190", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "dtype_cast_186", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "add_101", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20", - "name": "alias_default_578", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_578", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "convert_element_type_494", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_494", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "alias_default_580", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - 
"cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_580", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "pow_42", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_42", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "mean_41", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_41", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "add_102", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, 
- "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_102", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "rsqrt_41", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_41", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "alias_default_581", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_580", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_581", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "mul_144", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
- "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_186", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "alias_default_579", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_144", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_579", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "mul_145", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_145", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": 
"convert_element_type_495", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_186", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w1", - "name": "dtype_cast_187", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_187", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w1", - "name": "permute_228", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": 
"convert_element_type_495", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "alias_default_582", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_228", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w1", - "name": "alias_default_583", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_582", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_583", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w1", - "name": "einsum_default_144", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_144", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w1", - "name": "alias_default_584", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_584", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "convert_element_type_498", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_498", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "alias_default_585", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_585", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "neg_20", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_20", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "exp_20", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_20", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "add_103", - "op": "aten.add.Tensor", - "phase": "forward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_585", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_103", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "div_20", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_20", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "convert_element_type_499", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - 
{ - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_188", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w3", - "name": "dtype_cast_188", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_188", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w3", - "name": "permute_229", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_229", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w3", - "name": "alias_default_587", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - 
"cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_582", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_587", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w3", - "name": "einsum_default_145", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_499", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "alias_default_586", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_145", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w3", - "name": "alias_default_588", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 
8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_586", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_588", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "mul_146", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_187", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w2", - "name": "dtype_cast_189", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - 
"name": "dtype_cast_189", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w2", - "name": "permute_230", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_146", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "alias_default_589", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_230", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w2", - "name": "alias_default_590", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 
694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_589", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_590", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w2", - "name": "einsum_default_146", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_578", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_146", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20", - "name": "add_104", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_198", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "dtype_cast_190", - "op": 
"autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_104", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20", - "name": "alias_default_591", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_591", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "convert_element_type_504", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_504", - "src_placement": 
"S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "alias_default_593", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_593", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "pow_43", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_43", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "mean_42", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"mean_42", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "add_105", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_105", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "rsqrt_42", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_42", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "alias_default_594", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)S(1)", - "name": "alias_default_593", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_594", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "mul_147", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_190", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "alias_default_592", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_147", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_592", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "mul_148", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_148", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "convert_element_type_505", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_191", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wq", - "name": "dtype_cast_191", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_191", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention.wq", - "name": "permute_231", - "op": 
"aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_505", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "alias_default_595", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_231", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wq", - "name": "alias_default_596", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_595", - "src_placement": "S(0)R", - "transition_cost": 0 
- }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_596", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wq", - "name": "einsum_default_147", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_192", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wk", - "name": "dtype_cast_192", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_192", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention.wk", - "name": "permute_232", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 
19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_232", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wk", - "name": "alias_default_597", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_595", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_597", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention.wk", - "name": "einsum_default_148", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_193", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wv", - "name": "dtype_cast_193", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": 
"return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_193", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention.wv", - "name": "permute_233", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_233", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wv", - "name": "alias_default_598", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_595", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_598", - "src_placement": "RS(1)", - "transition_cost": 
0 - } - ], - "module_path": "L['self'].layers.21.attention.wv", - "name": "einsum_default_149", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_147", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_531", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_148", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_532", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"einsum_default_149", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_533", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_531", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "convert_element_type_512", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_512", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_534", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 
0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_534", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_as_complex_42", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_532", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "convert_element_type_513", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_513", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_535", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - 
"code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_535", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_as_complex_43", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_536", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_536", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "alias_default_599", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_42", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_599", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "mul_149", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_149", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_as_real_42", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", 
- "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_42", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_537", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_599", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "mul_150", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_150", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_as_real_43", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * 
freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_538", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_537", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "convert_element_type_514", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_538", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": 
"convert_element_type_515", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_515", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "unsqueeze_42", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_42", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "expand_42", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_42", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "clone_42", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_42", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_539", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_533", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "unsqueeze_43", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "unsqueeze_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "expand_43", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "clone_43", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_540", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_514", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "permute_234", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_539", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "permute_235", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_540", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "permute_236", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_234", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "alias_default_600", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_235", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "alias_default_601", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_236", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "alias_default_602", - "op": "aten.alias.default", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_600", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_601", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_602", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_21", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.sdpa", - "name": "getitem_189", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, 
is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.sdpa", - "name": "getitem_190", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_21", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.sdpa", - "name": "getitem_195", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_21", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.sdpa", - "name": "getitem_196", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_189", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.sdpa", - "name": "alias_default_603", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_603", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "permute_237", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_237", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_541", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - 
], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_194", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wo", - "name": "dtype_cast_194", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_194", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention.wo", - "name": "permute_238", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_541", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "alias_default_604", - "op": "aten.alias.default", - "phase": "forward", - 
"placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_238", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wo", - "name": "alias_default_605", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_604", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_605", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wo", - "name": "einsum_default_150", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_591", - "src_placement": 
"S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_150", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21", - "name": "add_106", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_199", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "dtype_cast_195", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_106", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21", - "name": "alias_default_606", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_606", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "convert_element_type_518", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_518", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "alias_default_608", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_608", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "pow_44", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_44", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "mean_43", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_43", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "add_107", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_107", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "rsqrt_43", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_43", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "alias_default_609", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_608", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_609", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "mul_151", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_195", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "alias_default_607", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_151", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_607", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "mul_152", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_152", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "convert_element_type_519", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_195", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w1", - "name": "dtype_cast_196", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_196", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w1", - "name": "permute_239", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_519", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "alias_default_610", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 
85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_239", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w1", - "name": "alias_default_611", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_610", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_611", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w1", - "name": "einsum_default_151", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_151", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w1", - "name": "alias_default_612", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 
- ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_612", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "convert_element_type_522", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_522", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "alias_default_613", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_613", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.21.feed_forward", - "name": "neg_21", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_21", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "exp_21", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_21", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "add_108", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_613", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_108", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "div_21", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_21", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "convert_element_type_523", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_197", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w3", - "name": "dtype_cast_197", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - 
"func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_197", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w3", - "name": "permute_240", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_240", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w3", - "name": "alias_default_615", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_610", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_615", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w3", - "name": "einsum_default_152", - "op": "aten.einsum.default", - "phase": 
"forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_523", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "alias_default_614", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_152", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w3", - "name": "alias_default_616", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_614", - "src_placement": "S(0)S(2)", - "transition_cost": 
0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_616", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "mul_153", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_196", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w2", - "name": "dtype_cast_198", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_198", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w2", - "name": "permute_241", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - 
"cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_153", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "alias_default_617", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_241", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w2", - "name": "alias_default_618", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_617", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_618", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w2", - "name": "einsum_default_153", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 
8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_606", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_153", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21", - "name": "add_109", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_207", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "dtype_cast_199", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_109", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21", - "name": "alias_default_619", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_619", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "convert_element_type_528", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_528", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "alias_default_621", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 
52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_621", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "pow_45", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_45", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "mean_44", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_44", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "add_110", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_110", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "rsqrt_44", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_44", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "alias_default_622", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_621", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_622", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "mul_154", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_199", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "alias_default_620", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_154", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_620", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "mul_155", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_155", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": 
"convert_element_type_529", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_200", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wq", - "name": "dtype_cast_200", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_200", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention.wq", - "name": "permute_242", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": 
"convert_element_type_529", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "alias_default_623", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_242", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wq", - "name": "alias_default_624", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_623", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_624", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wq", - "name": "einsum_default_154", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_201", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wk", - "name": "dtype_cast_201", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_201", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention.wk", - "name": "permute_243", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_243", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wk", - "name": "alias_default_625", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_623", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_625", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention.wk", - "name": "einsum_default_155", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_202", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wv", - "name": "dtype_cast_202", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_202", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.22.attention.wv", - "name": "permute_244", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_244", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wv", - "name": "alias_default_626", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_623", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_626", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wv", - "name": "einsum_default_156", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_154", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_556", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_155", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_557", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_156", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_558", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { 
- "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_556", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "convert_element_type_536", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_536", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_559", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_559", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_as_complex_44", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": 
"xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_557", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "convert_element_type_537", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_537", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_560", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_560", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } 
- ], - "module_path": "L['self'].layers.22.attention", - "name": "view_as_complex_45", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_561", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_561", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "alias_default_627", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", 
- "name": "view_as_complex_44", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_627", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "mul_156", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_156", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_as_real_44", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_44", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_562", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_45", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_627", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "mul_157", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_157", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_as_real_45", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_45", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.22.attention", - "name": "view_563", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_562", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "convert_element_type_538", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_563", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "convert_element_type_539", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_539", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "unsqueeze_44", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_44", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "expand_44", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_44", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "clone_44", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 
0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_44", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_564", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_558", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "unsqueeze_45", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_45", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "expand_45", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_45", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "clone_45", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_45", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_565", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_538", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "permute_245", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, 
head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_564", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "permute_246", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_565", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "permute_247", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_245", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "alias_default_628", - "op": "aten.alias.default", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_246", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "alias_default_629", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_247", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "alias_default_630", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_628", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_629", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_630", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_22", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_22", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.sdpa", - "name": "getitem_198", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_22", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.sdpa", - "name": "getitem_199", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - 
"shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_22", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.sdpa", - "name": "getitem_204", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_22", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.sdpa", - "name": "getitem_205", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_198", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.sdpa", - "name": "alias_default_631", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - 
"source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_631", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "permute_248", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_248", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_566", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_203", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wo", - "name": "dtype_cast_203", - "op": "autoparallel.dtype_cast.default", - "phase": 
"forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_203", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention.wo", - "name": "permute_249", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_566", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "alias_default_632", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_249", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wo", - "name": "alias_default_633", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_632", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_633", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wo", - "name": "einsum_default_157", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_619", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_157", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22", - "name": "add_111", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, 
- "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_208", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "dtype_cast_204", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_111", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22", - "name": "alias_default_634", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_634", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "convert_element_type_542", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_542", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "alias_default_636", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_636", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "pow_46", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_46", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "mean_45", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_45", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "add_112", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_112", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "rsqrt_45", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_45", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "alias_default_637", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_636", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_637", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "mul_158", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_204", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "alias_default_635", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_158", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "alias_default_635", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "mul_159", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_159", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "convert_element_type_543", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_204", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w1", - "name": "dtype_cast_205", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - 
"cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_205", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w1", - "name": "permute_250", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_543", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "alias_default_638", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_250", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w1", - "name": "alias_default_639", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_638", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_639", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w1", - "name": "einsum_default_158", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_158", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w1", - "name": "alias_default_640", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_640", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": 
"convert_element_type_546", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_546", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "alias_default_641", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_641", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "neg_22", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_22", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "exp_22", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_22", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "add_113", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_641", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_113", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "div_22", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { 
- "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_22", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "convert_element_type_547", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_206", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w3", - "name": "dtype_cast_206", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_206", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w3", - "name": "permute_251", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_251", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w3", - "name": "alias_default_643", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_638", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_643", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w3", - "name": "einsum_default_159", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_547", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.22.feed_forward", - "name": "alias_default_642", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_159", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w3", - "name": "alias_default_644", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_642", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_644", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "mul_160", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - 
"cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_205", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w2", - "name": "dtype_cast_207", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_207", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w2", - "name": "permute_252", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_160", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "alias_default_645", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_252", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w2", - "name": "alias_default_646", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_645", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_646", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w2", - "name": "einsum_default_160", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_634", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 
430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_160", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22", - "name": "add_114", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_216", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "dtype_cast_208", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_114", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22", - "name": "alias_default_647", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_647", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "convert_element_type_552", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_552", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "alias_default_649", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_649", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "pow_47", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": 
"rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_47", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "mean_46", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_46", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "add_115", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_115", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "rsqrt_46", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - 
"line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_46", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "alias_default_650", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_649", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_650", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "mul_161", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_208", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "alias_default_648", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - 
"shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_161", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_648", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "mul_162", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_162", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "convert_element_type_553", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_209", - 
"src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wq", - "name": "dtype_cast_209", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_209", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention.wq", - "name": "permute_253", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_553", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "alias_default_651", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_253", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wq", - "name": "alias_default_652", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_651", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_652", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wq", - "name": "einsum_default_161", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_210", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wk", - "name": "dtype_cast_210", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, 
_dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_210", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention.wk", - "name": "permute_254", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_254", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wk", - "name": "alias_default_653", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_651", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_653", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention.wk", - 
"name": "einsum_default_162", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_211", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wv", - "name": "dtype_cast_211", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_211", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention.wv", - "name": "permute_255", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_255", - "src_placement": 
"RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wv", - "name": "alias_default_654", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_651", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_654", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wv", - "name": "einsum_default_163", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_161", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_581", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 
- }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_162", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_582", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_163", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_583", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_581", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "convert_element_type_560", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_560", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_584", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_584", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_as_complex_46", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_582", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "convert_element_type_561", - 
"op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_561", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_585", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_585", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_as_complex_47", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_586", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_586", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "alias_default_655", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_46", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_655", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "mul_163", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_163", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_as_real_46", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_46", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_587", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_47", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_655", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "mul_164", - "op": "aten.mul.Tensor", - 
"phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_164", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_as_real_47", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_47", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_588", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_587", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "convert_element_type_562", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_588", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "convert_element_type_563", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_563", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "unsqueeze_46", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", 
- "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_46", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "expand_46", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_46", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "clone_46", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_46", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_589", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_583", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "unsqueeze_47", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_47", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "expand_47", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_47", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "clone_47", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_47", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_590", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_562", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "permute_256", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_589", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "permute_257", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 
- ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_590", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "permute_258", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_256", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "alias_default_656", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_257", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - 
"name": "alias_default_657", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_258", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "alias_default_658", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_656", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_657", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_658", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_23", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, 
- 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_23", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.sdpa", - "name": "getitem_207", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_23", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.sdpa", - "name": "getitem_208", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_23", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.sdpa", - "name": "getitem_213", - "op": "", - "phase": "forward", - "placement": "RR", - 
"shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_23", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.sdpa", - "name": "getitem_214", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_207", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.sdpa", - "name": "alias_default_659", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_659", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "permute_259", - "op": "aten.permute.default", - "phase": "forward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_259", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_591", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_212", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wo", - "name": "dtype_cast_212", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_212", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention.wo", - "name": 
"permute_260", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_591", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "alias_default_660", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_260", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wo", - "name": "alias_default_661", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_660", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_661", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wo", - "name": "einsum_default_164", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_647", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_164", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23", - "name": "add_116", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_217", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "dtype_cast_213", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, 
- "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_116", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23", - "name": "alias_default_662", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_662", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "convert_element_type_566", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_566", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "alias_default_664", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_664", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "pow_48", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_48", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "mean_47", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_47", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "add_117", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_117", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "rsqrt_47", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_47", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "alias_default_665", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_664", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_665", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - 
"name": "mul_165", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_213", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "alias_default_663", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_165", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_663", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "mul_166", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_166", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "convert_element_type_567", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_213", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w1", - "name": "dtype_cast_214", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_214", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w1", - "name": "permute_261", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - 
"cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_567", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "alias_default_666", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_261", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w1", - "name": "alias_default_667", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_666", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_667", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w1", - "name": "einsum_default_165", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, 
- 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_165", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w1", - "name": "alias_default_668", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_668", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "convert_element_type_570", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_570", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.23.feed_forward", - "name": "alias_default_669", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_669", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "neg_23", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "exp_23", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_23", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "add_118", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_669", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_118", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "div_23", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "convert_element_type_571", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_215", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w3", - "name": "dtype_cast_215", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_215", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w3", - "name": "permute_262", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_262", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w3", - "name": "alias_default_671", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * 
self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_666", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_671", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w3", - "name": "einsum_default_166", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_571", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "alias_default_670", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_166", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w3", - "name": "alias_default_672", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_670", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_672", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "mul_167", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_214", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w2", - "name": "dtype_cast_216", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 
- }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_216", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w2", - "name": "permute_263", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_167", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "alias_default_673", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_263", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w2", - "name": "alias_default_674", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_673", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_674", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w2", - "name": "einsum_default_167", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_662", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_167", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23", - "name": "add_119", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, 
- "dst_placement": "S(0)S(0)", - "name": "primals_225", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "dtype_cast_217", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_119", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23", - "name": "alias_default_675", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_675", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "convert_element_type_576", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - 
"cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_576", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "alias_default_677", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_677", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "pow_49", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_49", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "mean_48", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - 
}, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_48", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "add_120", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_120", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "rsqrt_48", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_48", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "alias_default_678", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, 
- "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_677", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_678", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "mul_168", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_217", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "alias_default_676", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_168", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_676", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.24.attention_norm", - "name": "mul_169", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_169", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "convert_element_type_577", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_218", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wq", - "name": "dtype_cast_218", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": 
"RS(0)", - "name": "dtype_cast_218", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention.wq", - "name": "permute_264", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_577", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "alias_default_679", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_264", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wq", - "name": "alias_default_680", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 
198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_679", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_680", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wq", - "name": "einsum_default_168", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_219", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wk", - "name": "dtype_cast_219", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_219", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention.wk", - "name": "permute_265", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_265", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wk", - "name": "alias_default_681", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_679", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_681", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention.wk", - "name": "einsum_default_169", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_220", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.24.attention.wv", - "name": "dtype_cast_220", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_220", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention.wv", - "name": "permute_266", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_266", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wv", - "name": "alias_default_682", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - 
"name": "alias_default_679", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_682", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wv", - "name": "einsum_default_170", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_168", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_606", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_169", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_607", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - 
}, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_170", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_608", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_606", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "convert_element_type_584", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_584", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_609", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = 
torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_609", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_as_complex_48", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_607", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "convert_element_type_585", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_585", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_610", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_610", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_as_complex_49", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_611", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": 
"complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_611", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "alias_default_683", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_48", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_683", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "mul_170", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_170", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_as_real_48", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_48", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_612", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_49", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_683", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "mul_171", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_171", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.24.attention", - "name": "view_as_real_49", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_49", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_613", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_612", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "convert_element_type_586", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 
9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_613", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "convert_element_type_587", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_587", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "unsqueeze_48", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_48", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "expand_48", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - 
"line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_48", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "clone_48", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_48", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_614", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_608", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "unsqueeze_49", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_49", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "expand_49", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_49", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "clone_49", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_49", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_615", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - 
"source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_586", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "permute_267", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_614", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "permute_268", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_615", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "permute_269", - "op": 
"aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_267", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "alias_default_684", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_268", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "alias_default_685", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"permute_269", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "alias_default_686", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_684", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_685", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_686", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_24", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.24.attention.sdpa", - "name": "getitem_216", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.sdpa", - "name": "getitem_217", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_24", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.sdpa", - "name": "getitem_222", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_24", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.sdpa", - "name": "getitem_223", - 
"op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_216", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.sdpa", - "name": "alias_default_687", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_687", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "permute_270", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_270", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.24.attention", - "name": "view_616", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_221", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wo", - "name": "dtype_cast_221", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_221", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention.wo", - "name": "permute_271", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_616", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "alias_default_688", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_271", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wo", - "name": "alias_default_689", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_688", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_689", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wo", - "name": "einsum_default_171", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 
39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_675", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_171", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24", - "name": "add_121", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_226", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "dtype_cast_222", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_121", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24", - "name": "alias_default_690", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_690", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "convert_element_type_590", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_590", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "alias_default_692", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_692", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "pow_50", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_50", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "mean_49", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_49", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "add_122", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_122", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "rsqrt_49", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - 
"shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_49", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "alias_default_693", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_692", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_693", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "mul_172", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_222", - "src_placement": 
"S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "alias_default_691", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_172", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_691", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "mul_173", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_173", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "convert_element_type_591", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_222", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w1", - "name": "dtype_cast_223", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_223", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w1", - "name": "permute_272", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_591", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "alias_default_694", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_272", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w1", - "name": "alias_default_695", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_694", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_695", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w1", - "name": "einsum_default_172", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_172", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w1", 
- "name": "alias_default_696", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_696", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "convert_element_type_594", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_594", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "alias_default_697", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "alias_default_697", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "neg_24", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "exp_24", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "add_123", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - 
"dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_697", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_123", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "div_24", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "convert_element_type_595", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_224", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w3", - "name": "dtype_cast_224", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_224", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w3", - "name": "permute_273", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_273", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w3", - "name": "alias_default_699", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_694", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_699", - "src_placement": "RS(1)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.24.feed_forward.w3", - "name": "einsum_default_173", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_595", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "alias_default_698", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_173", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w3", - "name": "alias_default_700", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_698", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_700", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "mul_174", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_223", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w2", - "name": "dtype_cast_225", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_225", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w2", - "name": "permute_274", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_174", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "alias_default_701", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_274", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w2", - "name": "alias_default_702", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_701", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_702", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.24.feed_forward.w2", - "name": "einsum_default_174", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_690", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_174", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24", - "name": "add_124", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_234", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "dtype_cast_226", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - 
"cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_124", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24", - "name": "alias_default_703", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_703", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "convert_element_type_600", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_600", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "alias_default_705", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_705", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "pow_51", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_51", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "mean_50", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_50", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "add_125", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_125", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "rsqrt_50", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_50", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "alias_default_706", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_705", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_706", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "mul_175", - "op": 
"aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_226", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "alias_default_704", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_175", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_704", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "mul_176", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_176", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "convert_element_type_601", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_227", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wq", - "name": "dtype_cast_227", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_227", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention.wq", - "name": "permute_275", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - 
"cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_601", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "alias_default_707", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_275", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wq", - "name": "alias_default_708", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_707", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_708", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wq", - "name": "einsum_default_175", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": 
{ - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_228", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wk", - "name": "dtype_cast_228", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_228", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention.wk", - "name": "permute_276", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_276", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wk", - "name": "alias_default_709", - "op": "aten.alias.default", - "phase": 
"forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_707", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_709", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention.wk", - "name": "einsum_default_176", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_229", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wv", - "name": "dtype_cast_229", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_229", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention.wv", - "name": "permute_277", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_277", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wv", - "name": "alias_default_710", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_707", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_710", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wv", - "name": "einsum_default_177", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_175", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_631", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_176", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_632", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_177", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_633", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = 
xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_631", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "convert_element_type_608", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_608", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_634", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_634", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.25.attention", - "name": "view_as_complex_50", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_632", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "convert_element_type_609", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_609", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_635", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - 
"cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_635", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_as_complex_51", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_636", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_636", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "alias_default_711", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_50", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_711", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "mul_177", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_177", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_as_real_50", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_50", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_637", - "op": "aten.view.default", 
- "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_51", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_711", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "mul_178", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_178", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_as_real_51", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": 
"float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_51", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_638", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_637", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "convert_element_type_610", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_638", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "convert_element_type_611", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_611", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "unsqueeze_50", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_50", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "expand_50", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_50", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "clone_50", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - 
], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_50", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_639", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_633", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "unsqueeze_51", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_51", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "expand_51", - "op": "aten.expand.default", - "phase": "forward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_51", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "clone_51", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_51", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_640", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_610", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", 
- "name": "permute_278", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_639", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "permute_279", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_640", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "permute_280", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"permute_278", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "alias_default_712", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_279", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "alias_default_713", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_280", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "alias_default_714", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - 
"cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_712", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_713", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_714", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_25", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.sdpa", - "name": "getitem_225", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"_scaled_dot_product_flash_attention_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.sdpa", - "name": "getitem_226", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_25", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.sdpa", - "name": "getitem_231", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_25", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.sdpa", - "name": "getitem_232", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_225", - "src_placement": 
"S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.sdpa", - "name": "alias_default_715", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_715", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "permute_281", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_281", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_641", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(0)", - "name": "primals_230", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wo", - "name": "dtype_cast_230", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_230", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention.wo", - "name": "permute_282", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_641", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "alias_default_716", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_282", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wo", - "name": "alias_default_717", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_716", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_717", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wo", - "name": "einsum_default_178", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_703", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_178", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25", - "name": "add_126", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - 
"source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_235", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "dtype_cast_231", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_126", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25", - "name": "alias_default_718", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_718", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": 
"convert_element_type_614", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_614", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "alias_default_720", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_720", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "pow_52", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_52", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "mean_51", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_51", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "add_127", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_127", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "rsqrt_51", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_51", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "alias_default_721", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_720", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_721", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "mul_179", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_231", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "alias_default_719", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - 
"compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_179", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_719", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "mul_180", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_180", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "convert_element_type_615", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_231", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w1", - "name": "dtype_cast_232", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_232", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w1", - "name": "permute_283", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_615", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "alias_default_722", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_283", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w1", - "name": "alias_default_723", - "op": "aten.alias.default", - "phase": "forward", - 
"placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_722", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_723", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w1", - "name": "einsum_default_179", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_179", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w1", - "name": "alias_default_724", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_724", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "convert_element_type_618", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_618", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "alias_default_725", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_725", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "neg_25", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "exp_25", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "add_128", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_725", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_128", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "div_25", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "convert_element_type_619", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_233", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w3", - "name": "dtype_cast_233", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_233", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.25.feed_forward.w3", - "name": "permute_284", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_284", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w3", - "name": "alias_default_727", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_722", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_727", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w3", - "name": "einsum_default_180", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": 
"alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_619", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "alias_default_726", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_180", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w3", - "name": "alias_default_728", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_726", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_728", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "mul_181", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_232", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w2", - "name": "dtype_cast_234", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_234", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w2", - "name": "permute_285", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_181", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "alias_default_729", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_285", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w2", - "name": "alias_default_730", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_729", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_730", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w2", - "name": "einsum_default_181", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_718", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_181", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25", - "name": "add_129", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_243", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "dtype_cast_235", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_129", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25", - "name": "alias_default_731", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_731", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "convert_element_type_624", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_624", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "alias_default_733", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_733", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "pow_53", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - 
"placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_53", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "mean_52", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_52", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "add_130", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_130", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "rsqrt_52", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_52", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "alias_default_734", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_733", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_734", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "mul_182", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_235", - 
"src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "alias_default_732", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_182", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_732", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "mul_183", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_183", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "convert_element_type_625", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_236", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wq", - "name": "dtype_cast_236", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_236", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention.wq", - "name": "permute_286", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_625", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "alias_default_735", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
- "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_286", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wq", - "name": "alias_default_736", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_735", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_736", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wq", - "name": "einsum_default_182", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_237", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wk", - "name": 
"dtype_cast_237", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_237", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention.wk", - "name": "permute_287", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_287", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wk", - "name": "alias_default_737", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_735", - "src_placement": "S(0)R", - 
"transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_737", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention.wk", - "name": "einsum_default_183", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_238", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wv", - "name": "dtype_cast_238", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_238", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention.wv", - "name": "permute_288", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - 
}, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_288", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wv", - "name": "alias_default_738", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_735", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_738", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wv", - "name": "einsum_default_184", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_182", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_656", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - 
"source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_183", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_657", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_184", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_658", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_656", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "convert_element_type_632", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_632", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_659", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_659", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_as_complex_52", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - 
"inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_657", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "convert_element_type_633", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_633", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_660", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_660", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_as_complex_53", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_661", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_661", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "alias_default_739", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_739", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "mul_184", - "op": "aten.mul.Tensor", 
- "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_184", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_as_real_52", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_662", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_53", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_739", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "mul_185", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_185", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_as_real_53", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_663", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 
- }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_662", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "convert_element_type_634", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_663", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "convert_element_type_635", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_635", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "unsqueeze_52", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, 
- 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "expand_52", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "clone_52", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_664", - "op": "aten.view.default", - "phase": "forward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_658", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "unsqueeze_53", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "expand_53", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - 
"name": "clone_53", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_665", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_634", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "permute_289", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_664", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "permute_290", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_665", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "permute_291", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_289", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "alias_default_740", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_290", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "alias_default_741", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_291", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "alias_default_742", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_740", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_741", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_742", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.26.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_26", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.sdpa", - "name": "getitem_234", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.sdpa", - "name": "getitem_235", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_26", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.sdpa", - "name": "getitem_240", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_26", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.sdpa", - "name": "getitem_241", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_234", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.sdpa", - "name": "alias_default_743", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_743", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "permute_292", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_292", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_666", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_239", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wo", - "name": "dtype_cast_239", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_239", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention.wo", - "name": "permute_293", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_666", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "alias_default_744", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_293", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wo", - "name": "alias_default_745", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": 
"einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_744", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_745", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wo", - "name": "einsum_default_185", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_731", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_185", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26", - "name": "add_131", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_244", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "dtype_cast_240", - "op": 
"autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_131", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26", - "name": "alias_default_746", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_746", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "convert_element_type_638", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_638", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "alias_default_748", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_748", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "pow_54", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_54", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "mean_53", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "mean_53", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "add_132", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_132", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "rsqrt_53", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_53", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "alias_default_749", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", 
- "name": "alias_default_748", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_749", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "mul_186", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_240", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "alias_default_747", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_186", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_747", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "mul_187", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_187", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "convert_element_type_639", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_240", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w1", - "name": "dtype_cast_241", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_241", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w1", - "name": "permute_294", - "op": "aten.permute.default", - "phase": "forward", - 
"placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_639", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "alias_default_750", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_294", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w1", - "name": "alias_default_751", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_750", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RS(1)", - "name": "alias_default_751", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w1", - "name": "einsum_default_186", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_186", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w1", - "name": "alias_default_752", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_752", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "convert_element_type_642", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_642", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "alias_default_753", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_753", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "neg_26", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "exp_26", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "add_133", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_753", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_133", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "div_26", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - 
"name": "convert_element_type_643", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_242", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w3", - "name": "dtype_cast_242", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_242", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w3", - "name": "permute_295", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": 
"permute_295", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w3", - "name": "alias_default_755", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_750", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_755", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w3", - "name": "einsum_default_187", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_643", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "alias_default_754", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_187", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w3", - "name": "alias_default_756", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_754", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_756", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "mul_188", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_241", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.26.feed_forward.w2", - "name": "dtype_cast_243", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_243", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w2", - "name": "permute_296", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_188", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "alias_default_757", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - 
"name": "permute_296", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w2", - "name": "alias_default_758", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_757", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_758", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w2", - "name": "einsum_default_188", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_746", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_188", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26", - "name": "add_134", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - 
"source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_252", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "dtype_cast_244", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_134", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26", - "name": "alias_default_759", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_759", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "convert_element_type_648", - "op": 
"prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_648", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "alias_default_761", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_761", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "pow_55", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_55", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.27.attention_norm", - "name": "mean_54", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_54", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "add_135", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_135", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "rsqrt_54", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_54", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.27.attention_norm", - "name": "alias_default_762", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_761", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_762", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "mul_189", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_244", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "alias_default_760", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 
52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_189", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_760", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "mul_190", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_190", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "convert_element_type_649", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_245", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wq", - "name": "dtype_cast_245", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_245", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention.wq", - "name": "permute_297", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_649", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "alias_default_763", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_297", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wq", - "name": "alias_default_764", - "op": "aten.alias.default", - "phase": 
"forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_763", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_764", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wq", - "name": "einsum_default_189", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_246", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wk", - "name": "dtype_cast_246", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_246", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention.wk", - "name": "permute_298", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_298", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wk", - "name": "alias_default_765", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_763", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_765", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention.wk", - "name": "einsum_default_190", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_247", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wv", - "name": "dtype_cast_247", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_247", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention.wv", - "name": "permute_299", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_299", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wv", - "name": "alias_default_766", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], 
- "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_763", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_766", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wv", - "name": "einsum_default_191", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_189", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_681", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_190", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_682", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_191", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_683", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_681", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "convert_element_type_656", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - 
"dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_656", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_684", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_684", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_as_complex_54", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_682", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "convert_element_type_657", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_657", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_685", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_685", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_as_complex_55", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_686", - "op": "aten.view.default", - "phase": 
"forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_686", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "alias_default_767", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_54", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_767", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "mul_191", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - 
"name": "mul_191", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_as_real_54", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_54", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_687", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_55", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_767", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "mul_192", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_192", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_as_real_55", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_55", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_688", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_687", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "convert_element_type_658", - "op": "prims.convert_element_type.default", - 
"phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_688", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "convert_element_type_659", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_659", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "unsqueeze_54", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_54", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "expand_54", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_54", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "clone_54", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_54", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_689", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "view_683", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "unsqueeze_55", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_55", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "expand_55", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_55", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "clone_55", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_55", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_690", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_658", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "permute_300", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_689", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "permute_301", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 
- }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_690", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "permute_302", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_300", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "alias_default_768", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_301", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "alias_default_769", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, 
seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_302", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "alias_default_770", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_768", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_769", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_770", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_27", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", 
- "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.sdpa", - "name": "getitem_243", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.sdpa", - "name": "getitem_244", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_27", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.sdpa", - "name": "getitem_249", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - 
"line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_27", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.sdpa", - "name": "getitem_250", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_243", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.sdpa", - "name": "alias_default_771", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_771", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "permute_303", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_303", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_691", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_248", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wo", - "name": "dtype_cast_248", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_248", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention.wo", - "name": "permute_304", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_691", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "alias_default_772", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_304", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wo", - "name": "alias_default_773", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_772", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_773", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wo", - "name": "einsum_default_192", - "op": 
"aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_759", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_192", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27", - "name": "add_136", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_253", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "dtype_cast_249", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "add_136", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27", - "name": "alias_default_774", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_774", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "convert_element_type_662", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_662", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "alias_default_776", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - 
"cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_776", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "pow_56", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_56", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "mean_55", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_55", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "add_137", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, 
- "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_137", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "rsqrt_55", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_55", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "alias_default_777", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_776", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_777", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "mul_193", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", 
- "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_249", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "alias_default_775", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_193", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_775", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "mul_194", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_194", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": 
"convert_element_type_663", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_249", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w1", - "name": "dtype_cast_250", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_250", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w1", - "name": "permute_305", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": 
"convert_element_type_663", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "alias_default_778", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_305", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w1", - "name": "alias_default_779", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_778", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_779", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w1", - "name": "einsum_default_193", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_193", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w1", - "name": "alias_default_780", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_780", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "convert_element_type_666", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_666", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "alias_default_781", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_781", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "neg_27", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_27", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "exp_27", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_27", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "add_138", - "op": "aten.add.Tensor", - "phase": "forward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_781", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_138", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "div_27", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_27", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "convert_element_type_667", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - 
{ - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_251", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w3", - "name": "dtype_cast_251", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_251", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w3", - "name": "permute_306", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_306", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w3", - "name": "alias_default_783", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - 
"cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_778", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_783", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w3", - "name": "einsum_default_194", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_667", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "alias_default_782", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_194", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w3", - "name": "alias_default_784", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 
8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_782", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_784", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "mul_195", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_250", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": "dtype_cast_252", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - 
"name": "dtype_cast_252", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": "permute_307", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_195", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "alias_default_785", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_307", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": "alias_default_786", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 
694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_785", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_786", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": "einsum_default_195", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_774", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_195", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27", - "name": "add_139", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_261", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "dtype_cast_253", - "op": 
"autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_139", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27", - "name": "alias_default_787", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_787", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "convert_element_type_672", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_672", - "src_placement": 
"S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "alias_default_789", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_789", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "pow_57", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_57", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "mean_56", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"mean_56", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "add_140", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_140", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "rsqrt_56", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_56", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "alias_default_790", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)S(1)", - "name": "alias_default_789", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_790", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "mul_196", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_253", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "alias_default_788", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_196", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_788", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "mul_197", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_197", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "convert_element_type_673", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_254", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wq", - "name": "dtype_cast_254", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_254", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention.wq", - "name": "permute_308", - "op": 
"aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_673", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "alias_default_791", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_308", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wq", - "name": "alias_default_792", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_791", - "src_placement": "S(0)R", - "transition_cost": 0 
- }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_792", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wq", - "name": "einsum_default_196", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_255", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wk", - "name": "dtype_cast_255", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_255", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention.wk", - "name": "permute_309", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 
19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_309", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wk", - "name": "alias_default_793", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_791", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_793", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention.wk", - "name": "einsum_default_197", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_256", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wv", - "name": "dtype_cast_256", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": 
"return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_256", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention.wv", - "name": "permute_310", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_310", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wv", - "name": "alias_default_794", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_791", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_794", - "src_placement": "RS(1)", - "transition_cost": 
0 - } - ], - "module_path": "L['self'].layers.28.attention.wv", - "name": "einsum_default_198", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_196", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_706", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_197", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_707", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"einsum_default_198", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_708", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_706", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "convert_element_type_680", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_680", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_709", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 
0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_709", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_as_complex_56", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_707", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "convert_element_type_681", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_681", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_710", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - 
"code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_710", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_as_complex_57", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_711", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_711", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "alias_default_795", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_795", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "mul_198", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_198", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_as_real_56", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", 
- "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_712", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_795", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "mul_199", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_199", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_as_real_57", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * 
freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_713", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_712", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "convert_element_type_682", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_713", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": 
"convert_element_type_683", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_683", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "unsqueeze_56", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "expand_56", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_56", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "clone_56", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_714", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_708", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "unsqueeze_57", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "unsqueeze_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "expand_57", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "clone_57", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_715", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_682", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "permute_311", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_714", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "permute_312", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_715", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "permute_313", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_311", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "alias_default_796", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_312", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "alias_default_797", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_313", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "alias_default_798", - "op": "aten.alias.default", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_796", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_797", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_798", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_28", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_28", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.sdpa", - "name": "getitem_252", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, 
is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_28", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.sdpa", - "name": "getitem_253", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_28", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.sdpa", - "name": "getitem_258", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_28", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.sdpa", - "name": "getitem_259", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_252", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.sdpa", - "name": "alias_default_799", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_799", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "permute_314", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_314", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_716", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - 
], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_257", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wo", - "name": "dtype_cast_257", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_257", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention.wo", - "name": "permute_315", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_716", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "alias_default_800", - "op": "aten.alias.default", - "phase": "forward", - 
"placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_315", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wo", - "name": "alias_default_801", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_800", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_801", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wo", - "name": "einsum_default_199", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_787", - "src_placement": 
"S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_199", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28", - "name": "add_141", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_262", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "dtype_cast_258", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_141", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28", - "name": "alias_default_802", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_802", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "convert_element_type_686", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_686", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "alias_default_804", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_804", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "pow_58", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_58", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "mean_57", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_57", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "add_142", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_142", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "rsqrt_57", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_57", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "alias_default_805", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_804", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_805", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "mul_200", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_258", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "alias_default_803", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_200", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_803", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "mul_201", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_201", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "convert_element_type_687", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_258", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w1", - "name": "dtype_cast_259", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_259", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w1", - "name": "permute_316", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_687", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "alias_default_806", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 
85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_316", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w1", - "name": "alias_default_807", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_806", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_807", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w1", - "name": "einsum_default_200", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_200", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w1", - "name": "alias_default_808", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 
- ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_808", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "convert_element_type_690", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_690", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "alias_default_809", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_809", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.28.feed_forward", - "name": "neg_28", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "exp_28", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "add_143", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_809", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_143", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "div_28", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "convert_element_type_691", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_260", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w3", - "name": "dtype_cast_260", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - 
"func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_260", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w3", - "name": "permute_317", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_317", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w3", - "name": "alias_default_811", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_806", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_811", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w3", - "name": "einsum_default_201", - "op": "aten.einsum.default", - "phase": 
"forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_691", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "alias_default_810", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_201", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w3", - "name": "alias_default_812", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_810", - "src_placement": "S(0)S(2)", - "transition_cost": 
0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_812", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "mul_202", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_259", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "dtype_cast_261", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_261", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "permute_318", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - 
"cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_202", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "alias_default_813", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_318", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "alias_default_814", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_813", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_814", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "einsum_default_202", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 
8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_802", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_202", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28", - "name": "add_144", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_270", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "dtype_cast_262", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_144", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28", - "name": "alias_default_815", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_815", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "convert_element_type_696", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_696", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "alias_default_817", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 
52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_817", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "pow_59", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_59", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "mean_58", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_58", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "add_145", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_145", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "rsqrt_58", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_58", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "alias_default_818", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_817", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_818", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "mul_203", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_262", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "alias_default_816", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_203", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_816", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "mul_204", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_204", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": 
"convert_element_type_697", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_263", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wq", - "name": "dtype_cast_263", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_263", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention.wq", - "name": "permute_319", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": 
"convert_element_type_697", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "alias_default_819", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_319", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wq", - "name": "alias_default_820", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_819", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_820", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wq", - "name": "einsum_default_203", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_264", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wk", - "name": "dtype_cast_264", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_264", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention.wk", - "name": "permute_320", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_320", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wk", - "name": "alias_default_821", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_819", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_821", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention.wk", - "name": "einsum_default_204", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_265", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wv", - "name": "dtype_cast_265", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_265", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.29.attention.wv", - "name": "permute_321", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_321", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wv", - "name": "alias_default_822", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_819", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_822", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wv", - "name": "einsum_default_205", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_203", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_731", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_204", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_732", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_205", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_733", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { 
- "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_731", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "convert_element_type_704", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_704", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_734", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_734", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_as_complex_58", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": 
"xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_732", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "convert_element_type_705", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_705", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_735", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_735", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } 
- ], - "module_path": "L['self'].layers.29.attention", - "name": "view_as_complex_59", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_736", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_736", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "alias_default_823", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", 
- "name": "view_as_complex_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_823", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "mul_205", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_205", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_as_real_58", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_737", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_823", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "mul_206", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_206", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_as_real_59", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.29.attention", - "name": "view_738", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_737", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "convert_element_type_706", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_738", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "convert_element_type_707", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_707", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "unsqueeze_58", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "expand_58", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "clone_58", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 
0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_739", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_733", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "unsqueeze_59", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "expand_59", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "clone_59", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_740", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_706", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "permute_322", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, 
head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_739", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "permute_323", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_740", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "permute_324", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_322", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "alias_default_824", - "op": "aten.alias.default", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_323", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "alias_default_825", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_324", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "alias_default_826", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_824", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_825", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_826", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_29", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_29", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.sdpa", - "name": "getitem_261", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_29", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.sdpa", - "name": "getitem_262", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - 
"shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_29", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.sdpa", - "name": "getitem_267", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_29", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.sdpa", - "name": "getitem_268", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_261", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.sdpa", - "name": "alias_default_827", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - 
"source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_827", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "permute_325", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_325", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_741", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_266", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wo", - "name": "dtype_cast_266", - "op": "autoparallel.dtype_cast.default", - "phase": 
"forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_266", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention.wo", - "name": "permute_326", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_741", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "alias_default_828", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_326", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wo", - "name": "alias_default_829", - "op": 
"aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_828", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_829", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wo", - "name": "einsum_default_206", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_815", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_206", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29", - "name": "add_146", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, 
- "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_271", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "dtype_cast_267", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_146", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29", - "name": "alias_default_830", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_830", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "convert_element_type_710", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_710", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "alias_default_832", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_832", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "pow_60", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_60", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "mean_59", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_59", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "add_147", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_147", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "rsqrt_59", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_59", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "alias_default_833", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_832", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_833", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "mul_207", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_267", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "alias_default_831", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_207", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "alias_default_831", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "mul_208", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_208", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "convert_element_type_711", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_267", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w1", - "name": "dtype_cast_268", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - 
"cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_268", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w1", - "name": "permute_327", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_711", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "alias_default_834", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_327", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w1", - "name": "alias_default_835", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_834", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_835", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w1", - "name": "einsum_default_207", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_207", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w1", - "name": "alias_default_836", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_836", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": 
"convert_element_type_714", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_714", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "alias_default_837", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_837", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "neg_29", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_29", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "exp_29", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "add_148", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_837", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_148", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "div_29", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { 
- "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "convert_element_type_715", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_269", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w3", - "name": "dtype_cast_269", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_269", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w3", - "name": "permute_328", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_328", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w3", - "name": "alias_default_839", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_834", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_839", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w3", - "name": "einsum_default_208", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_715", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.29.feed_forward", - "name": "alias_default_838", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_208", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w3", - "name": "alias_default_840", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_838", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_840", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "mul_209", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - 
"cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_268", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w2", - "name": "dtype_cast_270", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_270", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w2", - "name": "permute_329", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_209", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "alias_default_841", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_329", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w2", - "name": "alias_default_842", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_841", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_842", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w2", - "name": "einsum_default_209", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_830", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 
430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_209", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29", - "name": "add_149", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_279", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "dtype_cast_271", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_149", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29", - "name": "alias_default_843", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_843", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "convert_element_type_720", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - "cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_720", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "alias_default_845", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_845", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "pow_61", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": 
"rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_61", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "mean_60", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_60", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "add_150", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_150", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "rsqrt_60", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - 
"line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_60", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "alias_default_846", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_845", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_846", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "mul_210", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_271", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "alias_default_844", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - 
"shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_210", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_844", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "mul_211", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_211", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "convert_element_type_721", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_272", - 
"src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wq", - "name": "dtype_cast_272", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": "RS(0)", - "name": "dtype_cast_272", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention.wq", - "name": "permute_330", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_721", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "alias_default_847", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_330", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wq", - "name": "alias_default_848", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_847", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_848", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wq", - "name": "einsum_default_210", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_273", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wk", - "name": "dtype_cast_273", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, 
_dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_273", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention.wk", - "name": "permute_331", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_331", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wk", - "name": "alias_default_849", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_847", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_849", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention.wk", - 
"name": "einsum_default_211", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_274", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wv", - "name": "dtype_cast_274", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_274", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention.wv", - "name": "permute_332", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_332", - "src_placement": 
"RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wv", - "name": "alias_default_850", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_847", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_850", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wv", - "name": "einsum_default_212", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_210", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_756", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 
- }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_211", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_757", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_212", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_758", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_756", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "convert_element_type_728", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_728", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_759", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_759", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_as_complex_60", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_757", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "convert_element_type_729", - 
"op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_729", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_760", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_760", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_as_complex_61", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_761", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_761", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "alias_default_851", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_60", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_851", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "mul_212", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_212", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_as_real_60", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_60", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_762", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_61", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_851", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "mul_213", - "op": "aten.mul.Tensor", - 
"phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_213", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_as_real_61", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_61", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_763", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_762", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "convert_element_type_730", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_763", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "convert_element_type_731", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_731", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "unsqueeze_60", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", 
- "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_60", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "expand_60", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_60", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "clone_60", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_60", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_764", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_758", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "unsqueeze_61", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_61", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "expand_61", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_61", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "clone_61", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_61", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_765", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_730", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "permute_333", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_764", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "permute_334", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 
- ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_765", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "permute_335", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_333", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "alias_default_852", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_334", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - 
"name": "alias_default_853", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_335", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "alias_default_854", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_852", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_853", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_854", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_30", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, 
- 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_30", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.sdpa", - "name": "getitem_270", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_30", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.sdpa", - "name": "getitem_271", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_30", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.sdpa", - "name": "getitem_276", - "op": "", - "phase": "forward", - "placement": "RR", - 
"shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_30", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.sdpa", - "name": "getitem_277", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_270", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.sdpa", - "name": "alias_default_855", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_855", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "permute_336", - "op": "aten.permute.default", - "phase": "forward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_336", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_766", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_275", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wo", - "name": "dtype_cast_275", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_275", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention.wo", - "name": 
"permute_337", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_766", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "alias_default_856", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_337", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wo", - "name": "alias_default_857", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_856", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_857", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wo", - "name": "einsum_default_213", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_843", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_213", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30", - "name": "add_151", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_280", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "dtype_cast_276", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, 
- "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_151", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30", - "name": "alias_default_858", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_858", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "convert_element_type_734", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_734", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "alias_default_860", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_860", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "pow_62", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_62", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "mean_61", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_61", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "add_152", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_152", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "rsqrt_61", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_61", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "alias_default_861", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_860", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_861", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - 
"name": "mul_214", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_276", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "alias_default_859", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_214", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_859", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "mul_215", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_215", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "convert_element_type_735", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_276", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w1", - "name": "dtype_cast_277", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_277", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w1", - "name": "permute_338", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - 
"cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_735", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "alias_default_862", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_338", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w1", - "name": "alias_default_863", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_862", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_863", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w1", - "name": "einsum_default_214", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, 
- 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_214", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w1", - "name": "alias_default_864", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_864", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "convert_element_type_738", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_738", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.30.feed_forward", - "name": "alias_default_865", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_865", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "neg_30", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_30", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "exp_30", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_30", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "add_153", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_865", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_153", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "div_30", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_30", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "convert_element_type_739", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_278", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w3", - "name": "dtype_cast_278", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_278", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w3", - "name": "permute_339", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_339", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w3", - "name": "alias_default_867", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * 
self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_862", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_867", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w3", - "name": "einsum_default_215", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_739", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "alias_default_866", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_215", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w3", - "name": "alias_default_868", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_866", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_868", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "mul_216", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_277", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": "dtype_cast_279", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 
- }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_279", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": "permute_340", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_216", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "alias_default_869", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_340", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": "alias_default_870", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_869", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_870", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": "einsum_default_216", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_858", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_216", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30", - "name": "add_154", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 0, - "cluster_root": "dtype_cast_1", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, 
- "dst_placement": "S(0)S(0)", - "name": "primals_288", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "dtype_cast_280", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_154", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30", - "name": "alias_default_871", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 1, - "cluster_root": "convert_element_type", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_871", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "convert_element_type_744", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 2, - 
"cluster_root": "alias_default_5", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_744", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "alias_default_873", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 3, - "cluster_root": "pow_1", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_873", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "pow_63", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 4, - "cluster_root": "mean", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_63", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "mean_62", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - 
}, - "transition_cost": 0.0 - }, - { - "cluster_id": 5, - "cluster_root": "add", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_62", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "add_155", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 6, - "cluster_root": "rsqrt", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_155", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "rsqrt_62", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 7, - "cluster_root": "alias_default_6", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_62", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "alias_default_874", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, 
- "transition_cost": 0.0 - }, - { - "cluster_id": 8, - "cluster_root": "mul", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_873", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_874", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "mul_217", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 9, - "cluster_root": "alias_default_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_280", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "alias_default_872", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 10, - "cluster_root": "mul_1", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_217", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_872", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.31.attention_norm", - "name": "mul_218", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 11, - "cluster_root": "convert_element_type_1", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_218", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "convert_element_type_745", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 12, - "cluster_root": "dtype_cast_2", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_281", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wq", - "name": "dtype_cast_281", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 13, - "cluster_root": "permute", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 93.01059422750424, - "dst_placement": 
"RS(0)", - "name": "dtype_cast_281", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention.wq", - "name": "permute_341", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 14, - "cluster_root": "alias_default_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_745", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "alias_default_875", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 15, - "cluster_root": "alias_default_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_341", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wq", - "name": "alias_default_876", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 16, - "cluster_root": "einsum_default", - "compute_cost": 
198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_875", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_876", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wq", - "name": "einsum_default_217", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 17, - "cluster_root": "dtype_cast_3", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_282", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wk", - "name": "dtype_cast_282", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 18, - "cluster_root": "permute_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 75.93123841862722, - "dst_placement": "RR", - "name": "dtype_cast_282", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention.wk", - "name": "permute_342", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 19, - "cluster_root": "alias_default_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_342", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wk", - "name": "alias_default_877", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 20, - "cluster_root": "einsum_default_1", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_875", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_877", - "src_placement": "RR", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention.wk", - "name": "einsum_default_218", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 21, - "cluster_root": "dtype_cast_4", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_283", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.31.attention.wv", - "name": "dtype_cast_283", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 22, - "cluster_root": "permute_2", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 39.60264855687606, - "dst_placement": "RS(0)", - "name": "dtype_cast_283", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention.wv", - "name": "permute_343", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 23, - "cluster_root": "alias_default_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_343", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wv", - "name": "alias_default_878", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 24, - "cluster_root": "einsum_default_2", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - 
"name": "alias_default_875", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_878", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wv", - "name": "einsum_default_219", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 25, - "cluster_root": "view_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_217", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_781", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 26, - "cluster_root": "view_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_218", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_782", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - 
}, - "transition_cost": 0.0 - }, - { - "cluster_id": 27, - "cluster_root": "view_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_219", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_783", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 28, - "cluster_root": "convert_element_type_8", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_781", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "convert_element_type_752", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 29, - "cluster_root": "view_9", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_752", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_784", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = 
torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 30, - "cluster_root": "view_as_complex", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_784", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_as_complex_62", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 31, - "cluster_root": "convert_element_type_9", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_782", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "convert_element_type_753", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 32, - "cluster_root": "view_10", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_753", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_785", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 33, - "cluster_root": "view_as_complex_1", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_785", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_as_complex_63", - "op": "aten.view_as_complex.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 34, - "cluster_root": "view_11", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_786", - "op": "aten.view.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 35, - "cluster_root": "alias_default_11", - "compute_cost": 0.0, - "dtype": 
"complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "view_786", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "alias_default_879", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "return freqs_cis.view(*shape)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "reshape_for_broadcast", - "line": 183 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 36, - "cluster_root": "mul_2", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_62", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_879", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "mul_219", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 37, - "cluster_root": "view_as_real", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_219", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_as_real_62", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 38, - "cluster_root": "view_12", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_62", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_787", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 39, - "cluster_root": "mul_3", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_63", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_879", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "mul_220", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 40, - "cluster_root": "view_as_real_1", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_220", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.31.attention", - "name": "view_as_real_63", - "op": "aten.view_as_real.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 41, - "cluster_root": "view_13", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_63", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_788", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 42, - "cluster_root": "convert_element_type_10", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_787", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "convert_element_type_754", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 43, - "cluster_root": "convert_element_type_11", - "compute_cost": 
9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_788", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "convert_element_type_755", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 44, - "cluster_root": "unsqueeze", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_755", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "unsqueeze_62", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 45, - "cluster_root": "expand", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_62", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "expand_62", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - 
"line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 46, - "cluster_root": "clone", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_62", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "clone_62", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 47, - "cluster_root": "view_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_62", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_789", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 48, - "cluster_root": "unsqueeze_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_783", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "unsqueeze_63", - "op": "aten.unsqueeze.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 49, - "cluster_root": "expand_1", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "unsqueeze_63", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "expand_63", - "op": "aten.expand.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 50, - "cluster_root": "clone_1", - "compute_cost": 26.027785181236673, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "expand_63", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "clone_63", - "op": "aten.clone.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 51, - "cluster_root": "view_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "clone_63", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_790", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - 
"source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 52, - "cluster_root": "permute_3", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_754", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "permute_344", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 53, - "cluster_root": "permute_4", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_789", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "permute_345", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 54, - "cluster_root": "permute_5", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_790", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "permute_346", - "op": 
"aten.permute.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 55, - "cluster_root": "alias_default_12", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_344", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "alias_default_880", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 56, - "cluster_root": "alias_default_13", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_345", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "alias_default_881", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 57, - "cluster_root": "alias_default_14", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"permute_346", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "alias_default_882", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 58, - "cluster_root": "_scaled_dot_product_flash_attention", - "compute_cost": 794.1005545110502, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_880", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_881", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_882", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_31", - "op": "aten._scaled_dot_product_flash_attention.default", - "phase": "forward", - "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 59, - "cluster_root": "getitem", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.31.attention.sdpa", - "name": "getitem_279", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.sdpa", - "name": "getitem_280", - "op": "", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_31", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.sdpa", - "name": "getitem_285", - "op": "", - "phase": "forward", - "placement": "RR", - "shape": [ - 2 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "uint64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_scaled_dot_product_flash_attention_31", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.sdpa", - "name": "getitem_286", - 
"op": "", - "phase": "forward", - "placement": "RR", - "shape": [], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 60, - "cluster_root": "alias_default_15", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_279", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.sdpa", - "name": "alias_default_883", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 61, - "cluster_root": "permute_6", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_883", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "permute_347", - "op": "aten.permute.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 62, - "cluster_root": "view_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "permute_347", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - } - ], - "module_path": 
"L['self'].layers.31.attention", - "name": "view_791", - "op": "aten.view.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 63, - "cluster_root": "dtype_cast_5", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_284", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wo", - "name": "dtype_cast_284", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 64, - "cluster_root": "permute_7", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 187.32495367450883, - "dst_placement": "RR", - "name": "dtype_cast_284", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention.wo", - "name": "permute_348", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 65, - "cluster_root": "alias_default_16", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "view_791", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "alias_default_884", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 66, - "cluster_root": "alias_default_17", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_348", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wo", - "name": "alias_default_885", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 67, - "cluster_root": "einsum_default_3", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_884", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_885", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wo", - "name": "einsum_default_220", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 68, - "cluster_root": "add_1", - "compute_cost": 
39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_871", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_220", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31", - "name": "add_156", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 69, - "cluster_root": "dtype_cast_6", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_289", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "dtype_cast_285", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 70, - "cluster_root": "alias_default_18", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_156", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31", - "name": "alias_default_886", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 419 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 71, - "cluster_root": "convert_element_type_14", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_886", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "convert_element_type_758", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 72, - "cluster_root": "alias_default_20", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_758", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "alias_default_888", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 73, - "cluster_root": "pow_2", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_888", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "pow_64", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 74, - "cluster_root": "mean_1", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_64", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "mean_63", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 75, - "cluster_root": "add_2", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_63", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "add_157", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 76, - "cluster_root": "rsqrt_1", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_157", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "rsqrt_63", - "op": "aten.rsqrt.default", - "phase": "forward", - "placement": "S(0)S(1)", - 
"shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 77, - "cluster_root": "alias_default_21", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_63", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "alias_default_889", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 78, - "cluster_root": "mul_4", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_888", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_889", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "mul_221", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 79, - "cluster_root": "alias_default_19", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_285", - "src_placement": 
"S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "alias_default_887", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 80, - "cluster_root": "mul_5", - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_221", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_887", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "mul_222", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 81, - "cluster_root": "convert_element_type_15", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_222", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "convert_element_type_759", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 82, - "cluster_root": "dtype_cast_7", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_285", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w1", - "name": "dtype_cast_286", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 83, - "cluster_root": "permute_8", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_286", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w1", - "name": "permute_349", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 84, - "cluster_root": "alias_default_22", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_759", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "alias_default_890", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 85, - "cluster_root": "alias_default_23", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_349", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w1", - "name": "alias_default_891", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 86, - "cluster_root": "einsum_default_4", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_890", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_891", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w1", - "name": "einsum_default_221", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 87, - "cluster_root": "alias_default_24", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_221", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w1", 
- "name": "alias_default_892", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 88, - "cluster_root": "convert_element_type_18", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_892", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "convert_element_type_762", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 89, - "cluster_root": "alias_default_25", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_762", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "alias_default_893", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 90, - "cluster_root": "neg", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "alias_default_893", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "neg_31", - "op": "aten.neg.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 91, - "cluster_root": "exp", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "exp_31", - "op": "aten.exp.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 92, - "cluster_root": "add_3", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "add_158", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 93, - "cluster_root": "div", - "compute_cost": 273.29174440298505, - 
"dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_893", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_158", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "div_31", - "op": "aten.div.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 94, - "cluster_root": "convert_element_type_19", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "div_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "convert_element_type_763", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 95, - "cluster_root": "dtype_cast_8", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_287", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w3", - "name": "dtype_cast_287", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 96, - "cluster_root": "permute_9", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(0)", - "name": "dtype_cast_287", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w3", - "name": "permute_350", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 97, - "cluster_root": "alias_default_27", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_350", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w3", - "name": "alias_default_895", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 98, - "cluster_root": "einsum_default_5", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_890", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_895", - "src_placement": "RS(1)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.31.feed_forward.w3", - "name": "einsum_default_222", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 99, - "cluster_root": "alias_default_26", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_763", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "alias_default_894", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 100, - "cluster_root": "alias_default_28", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_222", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w3", - "name": "alias_default_896", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 101, - "cluster_root": "mul_6", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_894", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_896", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "mul_223", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 102, - "cluster_root": "dtype_cast_9", - "compute_cost": 8.540367012593283, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "primals_286", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w2", - "name": "dtype_cast_288", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 103, - "cluster_root": "permute_10", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 258.576, - "dst_placement": "RS(1)", - "name": "dtype_cast_288", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w2", - "name": "permute_351", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 104, - "cluster_root": "alias_default_29", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_223", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "alias_default_897", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 105, - "cluster_root": "alias_default_30", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_351", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w2", - "name": "alias_default_898", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 106, - "cluster_root": "einsum_default_6", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_897", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_898", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.31.feed_forward.w2", - "name": "einsum_default_223", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 107, - "cluster_root": "add_4", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_886", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_223", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31", - "name": "add_159", - "op": "aten.add.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_290", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "dtype_cast_289", - "op": "autoparallel.dtype_cast.default", - "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 108, - "cluster_root": "alias_default_31", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_159", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31", - "name": "alias_default_899", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "out = h + self.feed_forward(self.ffn_norm(h))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 420 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_899", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "convert_element_type_768", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_768", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "alias_default_901", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - 
"name": "alias_default_901", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "pow_65", - "op": "aten.pow.Tensor_Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "pow_65", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "mean_64", - "op": "aten.mean.dim", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mean_64", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "add_160", - "op": "aten.add.Scalar", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_160", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "rsqrt_64", - "op": "aten.rsqrt.default", - "phase": "forward", - 
"placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "rsqrt_64", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "alias_default_902", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_901", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_902", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "mul_224", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 28.358260191421483, - "dst_placement": "RR", - "name": "dtype_cast_289", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].norm", - "name": "alias_default_900", - "op": "aten.alias.default", - "phase": "forward", - 
"placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 52.058747582344104, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_224", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_900", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "mul_225", - "op": "aten.mul.Tensor", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_225", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "convert_element_type_769", - "op": "prims.convert_element_type.default", - "phase": "forward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 76.40578345195063, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(0)", - "name": "primals_291", - "src_placement": "S(0)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].output", - "name": "dtype_cast_290", - "op": "autoparallel.dtype_cast.default", 
- "phase": "forward", - "placement": "S(0)S(0)", - "shape": [ - 128256, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 2081.296, - "dst_placement": "RS(0)", - "name": "dtype_cast_290", - "src_placement": "S(0)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].output", - "name": "permute_352", - "op": "aten.permute.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 128256 - ], - "source": { - "code": "output = self.output(h) if self.output else h", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 545 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_769", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].norm", - "name": "alias_default_903", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_352", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].output", - "name": "alias_default_904", - "op": "aten.alias.default", - "phase": "forward", - "placement": "RS(1)", - "shape": [ - 4096, - 128256 - ], - "source": { - "code": "output = 
self.output(h) if self.output else h", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 545 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 6216.318403281814, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_903", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_904", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].output", - "name": "einsum_default_224", - "op": "aten.einsum.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 128256 - ], - "source": { - "code": "output = self.output(h) if self.output else h", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 545 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_224", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].output", - "name": "alias_default_1420", - "op": "aten.alias.default", - "phase": "forward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 128256 - ], - "source": { - "code": "output = self.output(h) if self.output else h", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 545 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "tangents_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].output", - "name": "alias_default_2", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 128256 - 
], - "transition_cost": 0.0 - }, - { - "compute_cost": 6216.318403281814, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_903", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].output", - "name": "einsum_default_225", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 128256 - ], - "source": { - "code": "output = self.output(h) if self.output else h", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 545 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_904", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].output", - "name": "permute_355", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 128256, - 4096 - ], - "source": { - "code": "output = self.output(h) if self.output else h", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 545 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 6216.318403281814, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_355", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].output", - "name": "einsum_default_226", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - 
"code": "output = self.output(h) if self.output else h", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 545 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_225", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].output", - "name": "permute_356", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 128256, - 4096 - ], - "source": { - "code": "output = self.output(h) if self.output else h", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 545 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 305.6231338078025, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_356", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].output", - "name": "dtype_cast_291", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 128256, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 4133.392, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_291", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].output", - "name": "alias_default_1711", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 128256, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_226", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].norm", - "name": "convert_element_type_776", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_899", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "convert_element_type_777", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_900", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "convert_element_type_778", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_776", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "alias_default_905", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_905", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_778", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "mul_226", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_777", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_902", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "mul_227", - "op": "aten.mul.Tensor", - "phase": "backward", - 
"placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_226", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "alias_default_906", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_227", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "alias_default_907", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_907", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_906", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "mul_228", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_228", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "sum_1", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_907", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "div_32", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_32", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "mul_229", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, 
- 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_906", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_229", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "sub", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_902", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "mul_230", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_905", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "alias_default_907", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "mul_231", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_231", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "sum_2", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_230", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "convert_element_type_779", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_2", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].norm", - "name": "convert_element_type_780", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_780", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].norm", - "name": "dtype_cast_292", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_292", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].norm", - "name": "alias_default_1710", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "convert_element_type_779", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].norm", - "name": "alias_default_908", - "op": 
"aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_908", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_897", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w2", - "name": "einsum_default_227", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_898", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w2", - "name": "permute_359", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_908", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_359", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w2", - "name": "einsum_default_228", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_227", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w2", - "name": "permute_360", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - "cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_360", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w2", - "name": "dtype_cast_293", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_293", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w2", - "name": "alias_default_1706", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_228", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w2", - "name": "alias_default_909", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_909", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_894", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "mul_232", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_909", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_896", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "mul_233", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_232", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "alias_default_910", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, 
- "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_910", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_890", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w3", - "name": "einsum_default_229", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_895", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w3", - "name": "permute_363", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_910", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_363", - "src_placement": "RS(0)", - "transition_cost": 0 - } 
- ], - "module_path": "L['self'].layers.31.feed_forward.w3", - "name": "einsum_default_230", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_229", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w3", - "name": "permute_364", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_364", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w3", - "name": "dtype_cast_294", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - 
"inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_294", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w3", - "name": "alias_default_1707", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_233", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "convert_element_type_789", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_892", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "convert_element_type_790", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_790", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "alias_default_911", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_911", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "neg_32", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_32", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "exp_32", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 
8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_32", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "add_161", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_161", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "reciprocal", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "mul_234", - "op": 
"aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_234", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "alias_default_912", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_789", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_912", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "mul_235", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": 
"float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_912", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "sub_1", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_911", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "mul_236", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_236", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "add_162", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_235", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_162", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "mul_237", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_237", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward", - "name": "convert_element_type_791", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_791", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.31.feed_forward", - "name": "alias_default_913", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_913", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_890", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w1", - "name": "einsum_default_231", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_891", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w1", - "name": "permute_367", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_913", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_367", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w1", - "name": "einsum_default_232", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_230", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_232", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31", - "name": "add_163", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_231", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.31.feed_forward.w1", - "name": "permute_368", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_368", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w1", - "name": "dtype_cast_295", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_295", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.feed_forward.w1", - "name": "alias_default_1705", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - 
"inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_163", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "convert_element_type_796", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_886", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "convert_element_type_797", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_887", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "convert_element_type_798", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 
2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_796", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "alias_default_914", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_914", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_798", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "mul_238", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_797", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_889", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.31.ffn_norm", - "name": "mul_239", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_238", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "alias_default_915", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_239", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "alias_default_916", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_916", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_915", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "mul_240", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_240", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "sum_3", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_916", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "div_33", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - 
"cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_33", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "mul_241", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_915", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_241", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "sub_2", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_2", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_889", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "mul_242", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_914", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_916", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "mul_243", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_243", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "sum_4", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 
39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_242", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "convert_element_type_799", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_4", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "convert_element_type_800", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_908", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_799", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "name": "add_164", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_800", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "dtype_cast_296", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_296", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.ffn_norm", - "name": "alias_default_1709", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_164", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wo", - "name": "alias_default_917", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - 
"code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_917", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_884", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wo", - "name": "einsum_default_233", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_885", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wo", - "name": "permute_371", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_917", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - 
"comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_371", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wo", - "name": "einsum_default_234", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_233", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wo", - "name": "permute_372", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_372", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wo", - "name": "dtype_cast_297", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": 
"alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_297", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention.wo", - "name": "alias_default_1704", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_234", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_812", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_812", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "permute_373", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", 
- "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_373", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_880", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_881", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_882", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_883", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_280", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_285", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_286", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.sdpa", - "name": "getitem_288", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.sdpa", - "name": "getitem_289", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.sdpa", - "name": "getitem_290", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_290", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "permute_374", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_289", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "permute_375", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_288", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "permute_376", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # 
(bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_374", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_813", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_813", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "sum_5", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "squeeze", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - 
"shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_375", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_814", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_814", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "sum_6", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "squeeze_1", - "op": 
"aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "convert_element_type_805", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_376", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "convert_element_type_806", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)S(2)", - "name": "convert_element_type_805", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_815", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_815", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_as_complex_64", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_879", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "_conj", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 
179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "clone_70", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_64", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_70", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "mul_244", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_806", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_816", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": 
"xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_816", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_as_complex_65", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_879", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "_conj_1", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_1", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "clone_71", - "op": "aten.clone.default", - 
"phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_65", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_71", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "mul_245", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_244", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_as_real_64", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - 
"dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_64", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_817", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_817", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "convert_element_type_807", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_245", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_as_real_65", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_65", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_818", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_818", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "convert_element_type_808", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_819", - "op": 
"aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_807", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_820", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_808", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "view_821", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_819", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.31.attention", - "name": "alias_default_918", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_918", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_875", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wv", - "name": "einsum_default_235", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_878", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wv", - "name": "permute_379", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": 
"einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_918", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_379", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention.wv", - "name": "einsum_default_236", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_235", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wv", - "name": "permute_380", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_380", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wv", - "name": "dtype_cast_298", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ 
- 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_298", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention.wv", - "name": "alias_default_1703", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_820", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "alias_default_919", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_919", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)R", - "name": "alias_default_875", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wk", - "name": "einsum_default_237", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_877", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wk", - "name": "permute_383", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_919", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_383", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wk", - "name": "einsum_default_238", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_236", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_238", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "add_165", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_237", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wk", - "name": "permute_384", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_384", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wk", - "name": 
"dtype_cast_299", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_299", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention.wk", - "name": "alias_default_1702", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_821", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention", - "name": "alias_default_920", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - 
"name": "alias_default_920", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_875", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wq", - "name": "einsum_default_239", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_876", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wq", - "name": "permute_387", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_920", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_387", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention.wq", - "name": "einsum_default_240", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - 
], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_165", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_240", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31", - "name": "add_166", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": "permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_239", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wq", - "name": "permute_388", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_388", - "src_placement": "P(sum)S(0)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention.wq", - "name": "dtype_cast_300", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_300", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention.wq", - "name": "alias_default_1701", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_166", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "convert_element_type_821", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_871", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "convert_element_type_822", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_872", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "convert_element_type_823", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_821", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "alias_default_921", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_921", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_823", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "mul_246", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_822", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_874", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "mul_247", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_246", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "alias_default_922", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_247", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "alias_default_923", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_923", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_922", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "mul_248", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_248", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "sum_7", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_923", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "div_34", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_34", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "mul_249", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - 
"code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_922", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_249", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "sub_3", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - "cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_874", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "mul_250", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, 
- "dst_placement": "S(0)S(1)", - "name": "alias_default_921", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_923", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "mul_251", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_251", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "sum_8", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_250", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "convert_element_type_824", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_8", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "convert_element_type_825", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_917", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_824", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "add_167", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_825", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - 
"name": "dtype_cast_301", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_301", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.31.attention_norm", - "name": "alias_default_1708", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_167", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": "alias_default_924", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_924", - 
"src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_869", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": "einsum_default_241", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_870", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": "permute_391", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_924", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_391", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": "einsum_default_242", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_241", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": "permute_392", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - "cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_392", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": "dtype_cast_302", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_302", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": 
"alias_default_1697", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_242", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w2", - "name": "alias_default_925", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_925", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_866", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "mul_252", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - 
"compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_925", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_868", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "mul_253", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_252", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "alias_default_926", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_926", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_862", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w3", - "name": 
"einsum_default_243", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_867", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w3", - "name": "permute_395", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_926", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_395", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w3", - "name": "einsum_default_244", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_243", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w3", - "name": "permute_396", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_396", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w3", - "name": "dtype_cast_303", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_303", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w3", - "name": "alias_default_1698", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_253", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "convert_element_type_834", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_864", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "convert_element_type_835", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_835", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": 
"alias_default_927", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_927", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "neg_33", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_33", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "exp_33", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_33", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "add_168", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_168", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "reciprocal_1", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_1", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "mul_254", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_254", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "alias_default_928", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_834", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_928", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "mul_255", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_928", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "sub_4", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_927", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "mul_256", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_256", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "add_169", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_255", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"add_169", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "mul_257", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_257", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "convert_element_type_836", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_836", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward", - "name": "alias_default_929", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - 
"cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_929", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_862", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w1", - "name": "einsum_default_245", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_863", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w1", - "name": "permute_399", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_929", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_399", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.30.feed_forward.w1", - "name": "einsum_default_246", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_244", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_246", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30", - "name": "add_170", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_245", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w1", - "name": "permute_400", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, 
- "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_400", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w1", - "name": "dtype_cast_304", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_304", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.feed_forward.w1", - "name": "alias_default_1696", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_170", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "convert_element_type_841", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_858", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "convert_element_type_842", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_859", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "convert_element_type_843", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_841", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "alias_default_930", - "op": "aten.alias.default", - "phase": 
"backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_930", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_843", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "mul_258", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_842", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_861", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "mul_259", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": 
"alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_258", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "alias_default_931", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_259", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "alias_default_932", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_932", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_931", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "mul_260", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_260", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "sum_9", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_932", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "div_35", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_35", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "mul_261", - "op": 
"aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_931", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_261", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "sub_5", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_861", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "mul_262", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - 
"cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_930", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_932", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "mul_263", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_263", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "sum_10", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_262", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "convert_element_type_844", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - 
"code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_10", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "convert_element_type_845", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_924", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_844", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "add_171", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_845", - "src_placement": 
"P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "dtype_cast_305", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_305", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.ffn_norm", - "name": "alias_default_1700", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_171", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wo", - "name": "alias_default_933", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_933", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_856", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wo", - "name": "einsum_default_247", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_857", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wo", - "name": "permute_403", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_933", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_403", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wo", - "name": "einsum_default_248", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - 
], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_247", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wo", - "name": "permute_404", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_404", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wo", - "name": "dtype_cast_306", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_306", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention.wo", - "name": 
"alias_default_1695", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_248", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_836", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_836", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "permute_405", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_405", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_852", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_853", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_854", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_855", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_271", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_276", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_277", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_1", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.sdpa", - "name": "getitem_291", - "op": "", - "phase": "backward", - 
"placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.sdpa", - "name": "getitem_292", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.sdpa", - "name": "getitem_293", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_293", - "src_placement": 
"S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "permute_406", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_292", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "permute_407", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_291", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "permute_408", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, 
- "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_406", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_837", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_837", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "sum_11", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "squeeze_2", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 
171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_407", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_838", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_838", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "sum_12", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "squeeze_3", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 
- }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_3", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "convert_element_type_850", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_408", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "convert_element_type_851", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_850", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_839", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 
64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_839", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_as_complex_66", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_851", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "_conj_2", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_2", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "clone_78", - 
"op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_66", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_78", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "mul_264", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_851", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_840", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - 
"compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_840", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_as_complex_67", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_851", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "_conj_3", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_3", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "clone_79", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - 
}, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_67", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_79", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "mul_265", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_264", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_as_real_66", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_66", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_841", - "op": "aten.view.default", - "phase": "backward", 
- "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_841", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "convert_element_type_852", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_265", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_as_real_67", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "view_as_real_67", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_842", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_842", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "convert_element_type_853", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_843", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 
- }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_852", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_844", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_853", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "view_845", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_843", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "alias_default_934", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_934", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_847", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wv", - "name": "einsum_default_249", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_850", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wv", - "name": "permute_411", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_934", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - 
"comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_411", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention.wv", - "name": "einsum_default_250", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_249", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wv", - "name": "permute_412", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_412", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wv", - "name": "dtype_cast_307", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { 
- "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_307", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention.wv", - "name": "alias_default_1694", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_844", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "alias_default_935", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_935", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_847", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wk", - "name": "einsum_default_251", - "op": "aten.einsum.default", - "phase": "backward", - "placement": 
"P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_849", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wk", - "name": "permute_415", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_935", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_415", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wk", - "name": "einsum_default_252", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)S(1)", - "name": "einsum_default_250", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_252", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "add_172", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_251", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wk", - "name": "permute_416", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_416", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wk", - "name": "dtype_cast_308", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_308", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention.wk", - "name": "alias_default_1693", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_845", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention", - "name": "alias_default_936", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_936", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_847", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.30.attention.wq", - "name": "einsum_default_253", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_848", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wq", - "name": "permute_419", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_936", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_419", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention.wq", - "name": "einsum_default_254", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - 
}, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_172", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_254", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30", - "name": "add_173", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": "permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_253", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wq", - "name": "permute_420", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_420", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention.wq", - "name": "dtype_cast_309", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 
4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_309", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention.wq", - "name": "alias_default_1692", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_173", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "convert_element_type_866", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_843", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.30.attention_norm", - "name": "convert_element_type_867", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_844", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "convert_element_type_868", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_866", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "alias_default_937", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_937", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_868", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "mul_266", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_867", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_846", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "mul_267", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_266", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "alias_default_938", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - 
"source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_267", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "alias_default_939", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_939", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_938", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "mul_268", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_268", - "src_placement": "S(0)S(1)", - "transition_cost": 
0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "sum_13", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_939", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "div_36", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_36", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_13", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "mul_269", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 
78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_938", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_269", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "sub_6", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - "cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_846", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "mul_270", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_937", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_939", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.30.attention_norm", - "name": "mul_271", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_271", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "sum_14", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_270", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "convert_element_type_869", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"P(sum)P(sum)", - "name": "sum_14", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "convert_element_type_870", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_933", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_869", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "add_174", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_870", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "dtype_cast_310", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_310", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.30.attention_norm", - "name": "alias_default_1699", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_174", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w2", - "name": "alias_default_940", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_940", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_841", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.29.feed_forward.w2", - "name": "einsum_default_255", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_842", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w2", - "name": "permute_423", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_940", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_423", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w2", - "name": "einsum_default_256", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_255", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w2", - "name": "permute_424", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - "cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_424", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w2", - "name": "dtype_cast_311", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_311", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w2", - "name": "alias_default_1688", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_256", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w2", - "name": "alias_default_941", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_941", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_838", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "mul_272", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_941", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_840", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "mul_273", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_272", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "alias_default_942", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_942", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_834", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w3", - "name": "einsum_default_257", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_839", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w3", - "name": "permute_427", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_942", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_427", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w3", - "name": "einsum_default_258", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_257", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.29.feed_forward.w3", - "name": "permute_428", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_428", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w3", - "name": "dtype_cast_312", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_312", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w3", - "name": "alias_default_1689", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - 
"inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_273", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "convert_element_type_879", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_836", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "convert_element_type_880", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_880", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "alias_default_943", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_943", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "neg_34", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "exp_34", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "add_175", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": 
{ - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_175", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "reciprocal_2", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_2", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "mul_274", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_274", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "alias_default_944", - "op": "aten.alias.default", 
- "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_879", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_944", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "mul_275", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_944", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "sub_7", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - 
{ - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_943", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "mul_276", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_276", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "add_176", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_275", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_176", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "mul_277", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - 
"code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_277", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "convert_element_type_881", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_881", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward", - "name": "alias_default_945", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_945", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)R", - "name": "alias_default_834", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w1", - "name": "einsum_default_259", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_835", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w1", - "name": "permute_431", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_945", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_431", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w1", - "name": "einsum_default_260", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_258", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_260", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29", - "name": "add_177", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_259", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w1", - "name": "permute_432", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_432", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.29.feed_forward.w1", - "name": "dtype_cast_313", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_313", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.feed_forward.w1", - "name": "alias_default_1687", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_177", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "convert_element_type_886", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_830", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "convert_element_type_887", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_831", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "convert_element_type_888", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_886", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "alias_default_946", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 
2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_946", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_888", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "mul_278", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_887", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_833", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "mul_279", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_278", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.29.ffn_norm", - "name": "alias_default_947", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_279", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "alias_default_948", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_948", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_947", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "mul_280", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 
26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_280", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "sum_15", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_948", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "div_37", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_37", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "mul_281", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_947", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_281", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "sub_8", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_8", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_833", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "mul_282", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_946", - "src_placement": "S(0)S(1)", 
- "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_948", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "mul_283", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_283", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "sum_16", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_282", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "convert_element_type_889", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_16", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "convert_element_type_890", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_940", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_889", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "add_178", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_890", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "dtype_cast_314", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - 
"shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_314", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.ffn_norm", - "name": "alias_default_1691", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_178", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wo", - "name": "alias_default_949", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_949", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", 
- "name": "alias_default_828", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wo", - "name": "einsum_default_261", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_829", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wo", - "name": "permute_435", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_949", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_435", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wo", - "name": "einsum_default_262", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_261", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wo", - "name": "permute_436", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_436", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wo", - "name": "dtype_cast_315", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_315", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention.wo", - "name": "alias_default_1686", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_262", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_860", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_860", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "permute_437", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_437", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_824", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "alias_default_825", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_826", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_827", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_262", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_267", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_268", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_2", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_2", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.sdpa", - "name": "getitem_294", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_2", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.sdpa", - "name": "getitem_295", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_2", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.sdpa", - "name": "getitem_296", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_296", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "permute_438", - "op": "aten.permute.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_295", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "permute_439", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_294", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "permute_440", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_438", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } 
- ], - "module_path": "L['self'].layers.29.attention", - "name": "view_861", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_861", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "sum_17", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "squeeze_4", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_439", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_862", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_862", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "sum_18", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "squeeze_5", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "convert_element_type_895", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_440", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "convert_element_type_896", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_895", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_863", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_863", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_as_complex_68", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_823", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "_conj_4", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_4", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "clone_86", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 
64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_68", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_86", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "mul_284", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_896", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_864", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)S(2)", - "name": "view_864", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_as_complex_69", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_823", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "_conj_5", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_5", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "clone_87", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - 
"compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_69", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_87", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "mul_285", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_284", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_as_real_68", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_68", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_865", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - 
"code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_865", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "convert_element_type_897", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_285", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_as_real_69", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_69", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_866", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_866", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "convert_element_type_898", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_867", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", 
- "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_897", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_868", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_898", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "view_869", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_867", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "alias_default_950", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_950", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_819", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wv", - "name": "einsum_default_263", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_822", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wv", - "name": "permute_443", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_950", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_443", - "src_placement": "RS(0)", - 
"transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention.wv", - "name": "einsum_default_264", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_263", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wv", - "name": "permute_444", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_444", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wv", - "name": "dtype_cast_316", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - 
"inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_316", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention.wv", - "name": "alias_default_1685", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_868", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "alias_default_951", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_951", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_819", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wk", - "name": "einsum_default_265", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_821", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wk", - "name": "permute_447", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_951", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_447", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wk", - "name": "einsum_default_266", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_264", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_266", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "add_179", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_265", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wk", - "name": "permute_448", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_448", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wk", - "name": "dtype_cast_317", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 
247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_317", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention.wk", - "name": "alias_default_1684", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_869", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention", - "name": "alias_default_952", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_952", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_819", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wq", - "name": "einsum_default_267", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - 
"shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_820", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wq", - "name": "permute_451", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_952", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_451", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention.wq", - "name": "einsum_default_268", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "add_179", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_268", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29", - "name": "add_180", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": "permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_267", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wq", - "name": "permute_452", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_452", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention.wq", - "name": "dtype_cast_318", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_318", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention.wq", - "name": "alias_default_1683", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_180", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "convert_element_type_911", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_815", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "convert_element_type_912", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_816", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "convert_element_type_913", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_911", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "alias_default_953", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_953", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_913", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "mul_286", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_912", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_818", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "mul_287", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_286", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "alias_default_954", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_287", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "alias_default_955", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_955", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_954", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "mul_288", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_288", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "sum_19", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_955", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "div_38", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_38", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "mul_289", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": 
"S(0)S(1)", - "name": "alias_default_954", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_289", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "sub_9", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - "cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_818", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "mul_290", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_953", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_955", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "mul_291", - "op": "aten.mul.Tensor", - 
"phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_291", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "sum_20", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_290", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "convert_element_type_914", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_20", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.29.attention_norm", - "name": "convert_element_type_915", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_949", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_914", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "add_181", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_915", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "dtype_cast_319", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 
0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_319", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.29.attention_norm", - "name": "alias_default_1690", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_181", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "alias_default_956", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_956", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_813", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "einsum_default_269", - "op": "aten.einsum.default", - "phase": "backward", - 
"placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_814", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "permute_455", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_956", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_455", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "einsum_default_270", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "P(sum)S(0)", - "name": "einsum_default_269", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "permute_456", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - "cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_456", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "dtype_cast_320", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_320", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "alias_default_1679", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 
1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_270", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w2", - "name": "alias_default_957", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_957", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_810", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "mul_292", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_957", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_812", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.28.feed_forward", - "name": "mul_293", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_292", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "alias_default_958", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_958", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_806", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w3", - "name": "einsum_default_271", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_811", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w3", - "name": "permute_459", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_958", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_459", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w3", - "name": "einsum_default_272", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_271", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w3", - "name": "permute_460", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 
14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_460", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w3", - "name": "dtype_cast_321", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_321", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w3", - "name": "alias_default_1680", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_293", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.28.feed_forward", - "name": "convert_element_type_924", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_808", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "convert_element_type_925", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_925", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "alias_default_959", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 
182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_959", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "neg_35", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "exp_35", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "add_182", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - 
}, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_182", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "reciprocal_3", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_3", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "mul_294", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_294", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "alias_default_960", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_924", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_960", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "mul_295", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_960", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "sub_10", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_959", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "sub_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "mul_296", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_296", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "add_183", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_295", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_183", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "mul_297", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", 
- "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_297", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "convert_element_type_926", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_926", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward", - "name": "alias_default_961", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_961", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_806", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w1", - 
"name": "einsum_default_273", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_807", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w1", - "name": "permute_463", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_961", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_463", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w1", - "name": "einsum_default_274", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - 
"compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_272", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_274", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28", - "name": "add_184", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_273", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w1", - "name": "permute_464", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_464", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w1", - "name": "dtype_cast_322", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": 
{ - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_322", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.feed_forward.w1", - "name": "alias_default_1678", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_184", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "convert_element_type_931", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_802", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.28.ffn_norm", - "name": "convert_element_type_932", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_803", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "convert_element_type_933", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_931", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "alias_default_962", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", 
- "name": "alias_default_962", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_933", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "mul_298", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_932", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_805", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "mul_299", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_298", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "alias_default_963", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_299", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "alias_default_964", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_964", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_963", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "mul_300", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_300", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - 
"name": "sum_21", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_964", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "div_39", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_39", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "mul_301", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "alias_default_963", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_301", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "sub_11", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_11", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_805", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "mul_302", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_962", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_964", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "mul_303", - "op": "aten.mul.Tensor", 
- "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_303", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "sum_22", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_302", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "convert_element_type_934", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_22", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.28.ffn_norm", - "name": "convert_element_type_935", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_956", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_934", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "add_185", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_935", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "dtype_cast_323", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_323", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.ffn_norm", - "name": "alias_default_1682", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_185", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wo", - "name": "alias_default_965", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_965", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_800", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wo", - "name": "einsum_default_275", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - 
"shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_801", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wo", - "name": "permute_467", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_965", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_467", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wo", - "name": "einsum_default_276", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_275", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.28.attention.wo", - "name": "permute_468", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_468", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wo", - "name": "dtype_cast_324", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_324", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention.wo", - "name": "alias_default_1677", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_276", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_884", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_884", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "permute_469", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_469", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_796", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_797", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_798", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, 
- "dst_placement": "S(0)S(1)", - "name": "alias_default_799", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_253", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_258", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_259", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_3", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.sdpa", - "name": "getitem_297", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": 
"_scaled_dot_product_flash_attention_backward_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.sdpa", - "name": "getitem_298", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.sdpa", - "name": "getitem_299", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_299", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "permute_470", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - 
"cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_298", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "permute_471", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_297", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "permute_472", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_470", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_885", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_885", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "sum_23", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "squeeze_6", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_471", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_886", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": 
".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_886", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "sum_24", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "squeeze_7", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "convert_element_type_940", - "op": "prims.convert_element_type.default", - 
"phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_472", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "convert_element_type_941", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_940", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_887", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"view_887", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_as_complex_70", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_795", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "_conj_6", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_6", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "clone_94", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, 
- "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_70", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_94", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "mul_304", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_941", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_888", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_888", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_as_complex_71", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = 
torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_795", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "_conj_7", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_7", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "clone_95", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_71", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_95", - "src_placement": "RR", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.28.attention", - "name": "mul_305", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_304", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_as_real_70", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_70", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_889", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 
9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_889", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "convert_element_type_942", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_305", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_as_real_71", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_71", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_890", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_890", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "convert_element_type_943", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_891", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_942", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_892", - "op": "aten.view.default", - "phase": 
"backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_943", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "view_893", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_891", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "alias_default_966", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_966", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)R", - "name": "alias_default_791", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wv", - "name": "einsum_default_277", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_794", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wv", - "name": "permute_475", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_966", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_475", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention.wv", - "name": "einsum_default_278", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_277", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wv", - "name": "permute_476", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_476", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wv", - "name": "dtype_cast_325", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_325", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention.wv", - "name": "alias_default_1676", - "op": "aten.alias.default", - "phase": "backward", - 
"placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_892", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "alias_default_967", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_967", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_791", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wk", - "name": "einsum_default_279", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "alias_default_793", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wk", - "name": "permute_479", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_967", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_479", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wk", - "name": "einsum_default_280", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_278", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_280", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "add_186", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_279", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wk", - "name": "permute_480", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_480", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wk", - "name": "dtype_cast_326", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_326", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention.wk", - 
"name": "alias_default_1675", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_893", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention", - "name": "alias_default_968", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_968", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_791", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wq", - "name": "einsum_default_281", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_792", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wq", - "name": "permute_483", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_968", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_483", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention.wq", - "name": "einsum_default_282", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_186", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_282", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28", - "name": "add_187", - "op": 
"aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": "permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_281", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wq", - "name": "permute_484", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_484", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention.wq", - "name": "dtype_cast_327", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_327", - "src_placement": 
"P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention.wq", - "name": "alias_default_1674", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_187", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "convert_element_type_956", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_787", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "convert_element_type_957", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - 
"cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_788", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "convert_element_type_958", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_956", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "alias_default_969", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_969", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_958", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "mul_306", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - 
"code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_957", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_790", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "mul_307", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_306", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "alias_default_970", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_307", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.28.attention_norm", - "name": "alias_default_971", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_971", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_970", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "mul_308", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_308", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "sum_25", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - 
"compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_971", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "div_40", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_40", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "mul_309", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_970", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_309", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "sub_12", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": 
"S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - "cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_790", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "mul_310", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_969", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_971", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "mul_311", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - 
"compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_311", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "sum_26", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_310", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "convert_element_type_959", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_26", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "convert_element_type_960", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", 
- "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_965", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_959", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "add_188", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_960", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": "dtype_cast_328", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_328", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.28.attention_norm", - "name": 
"alias_default_1681", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_188", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": "alias_default_972", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_972", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_785", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": "einsum_default_283", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_786", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": "permute_487", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_972", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_487", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": "einsum_default_284", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_283", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": "permute_488", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - "cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_488", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": "dtype_cast_329", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_329", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": "alias_default_1670", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_284", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w2", - "name": 
"alias_default_973", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_973", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_782", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "mul_312", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_973", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_784", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "mul_313", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - 
"func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_312", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "alias_default_974", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_974", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_778", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w3", - "name": "einsum_default_285", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_783", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w3", - "name": "permute_491", - "op": 
"aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_974", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_491", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w3", - "name": "einsum_default_286", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_285", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w3", - "name": "permute_492", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 
34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_492", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w3", - "name": "dtype_cast_330", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_330", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w3", - "name": "alias_default_1671", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_313", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "convert_element_type_969", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_780", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "convert_element_type_970", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_970", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "alias_default_975", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_975", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "neg_36", - "op": "aten.neg.default", - 
"phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_36", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "exp_36", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_36", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "add_189", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_189", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.27.feed_forward", - "name": "reciprocal_4", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_4", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "mul_314", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_314", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "alias_default_976", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - 
"name": "convert_element_type_969", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_976", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "mul_315", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_976", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "sub_13", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_975", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "mul_316", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_316", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "add_190", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_315", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_190", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "mul_317", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_317", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.27.feed_forward", - "name": "convert_element_type_971", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_971", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward", - "name": "alias_default_977", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_977", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_778", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w1", - "name": "einsum_default_287", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - 
"line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_779", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w1", - "name": "permute_495", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_977", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_495", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w1", - "name": "einsum_default_288", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_286", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_288", - "src_placement": "S(0)P(sum)", 
- "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27", - "name": "add_191", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_287", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w1", - "name": "permute_496", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_496", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w1", - "name": "dtype_cast_331", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - 
"inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_331", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.feed_forward.w1", - "name": "alias_default_1669", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_191", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "convert_element_type_976", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_774", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "convert_element_type_977", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_775", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "convert_element_type_978", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_976", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "alias_default_978", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_978", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_978", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": 
"mul_318", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_977", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_777", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "mul_319", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_318", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "alias_default_979", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_319", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "alias_default_980", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_980", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_979", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "mul_320", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_320", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "sum_27", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": 
"rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_980", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "div_41", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_41", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "mul_321", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_979", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_321", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.27.ffn_norm", - "name": "sub_14", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_777", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "mul_322", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_978", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_980", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "mul_323", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 
- }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_323", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "sum_28", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_322", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "convert_element_type_979", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_28", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "convert_element_type_980", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_972", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_979", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "add_192", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_980", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.ffn_norm", - "name": "dtype_cast_332", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_332", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.27.ffn_norm", - "name": "alias_default_1673", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_192", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wo", - "name": "alias_default_981", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_981", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_772", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wo", - "name": "einsum_default_289", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": 
"permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_773", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wo", - "name": "permute_499", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_981", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_499", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wo", - "name": "einsum_default_290", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_289", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wo", - "name": "permute_500", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_500", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wo", - "name": "dtype_cast_333", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_333", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention.wo", - "name": "alias_default_1668", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_290", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_908", - "op": "aten.view.default", - "phase": 
"backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_908", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "permute_501", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_501", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_768", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_769", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_770", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_771", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_244", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - 
}, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_249", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_250", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_4", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_4", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.sdpa", - "name": "getitem_300", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_4", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.sdpa", - "name": "getitem_301", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 
8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_4", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.sdpa", - "name": "getitem_302", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_302", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "permute_502", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_301", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.27.attention", - "name": "permute_503", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_300", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "permute_504", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_502", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_909", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "view_909", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "sum_29", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "squeeze_8", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_503", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_910", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_910", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "sum_30", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_30", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "squeeze_9", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "convert_element_type_985", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_504", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "convert_element_type_986", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_985", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_911", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_911", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_as_complex_72", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - 
"code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_767", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "_conj_8", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_8", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "clone_102", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_72", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_102", - "src_placement": "RR", - "transition_cost": 0 
- } - ], - "module_path": "L['self'].layers.27.attention", - "name": "mul_324", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_986", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_912", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_912", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_as_complex_73", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", 
- "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_767", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "_conj_9", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_9", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "clone_103", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_73", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_103", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "mul_325", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_324", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_as_real_72", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_72", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_913", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_913", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "convert_element_type_987", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_325", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_as_real_73", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_73", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_914", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_914", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "convert_element_type_988", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_915", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_987", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_916", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, 
- { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_988", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "view_917", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_915", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "alias_default_982", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_982", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_763", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wv", - "name": "einsum_default_291", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, 
- 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_766", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wv", - "name": "permute_507", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_982", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_507", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention.wv", - "name": "einsum_default_292", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - 
"name": "einsum_default_291", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wv", - "name": "permute_508", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_508", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wv", - "name": "dtype_cast_334", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_334", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention.wv", - "name": "alias_default_1667", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - 
"cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_916", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "alias_default_983", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_983", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_763", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wk", - "name": "einsum_default_293", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_765", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wk", - "name": "permute_511", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, 
xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_983", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_511", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wk", - "name": "einsum_default_294", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_292", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_294", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "add_193", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_293", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wk", - "name": "permute_512", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_512", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wk", - "name": "dtype_cast_335", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_335", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention.wk", - "name": "alias_default_1666", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": 
"getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_917", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention", - "name": "alias_default_984", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_984", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_763", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wq", - "name": "einsum_default_295", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_764", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wq", - "name": "permute_515", - "op": "aten.permute.default", - "phase": 
"backward", - "placement": "RS(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_984", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_515", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention.wq", - "name": "einsum_default_296", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_193", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_296", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27", - "name": "add_194", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 
0.0 - }, - { - "cluster_id": 249, - "cluster_root": "permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_295", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wq", - "name": "permute_516", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_516", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention.wq", - "name": "dtype_cast_336", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_336", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention.wq", - "name": "alias_default_1665", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_194", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "convert_element_type_1001", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_759", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "convert_element_type_1002", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_760", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "convert_element_type_1003", - 
"op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1001", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "alias_default_985", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_985", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1003", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "mul_326", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1002", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_762", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "mul_327", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_326", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "alias_default_986", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_327", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "alias_default_987", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_987", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_986", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "mul_328", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_328", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "sum_31", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_987", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "div_42", - "op": 
"aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_42", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "mul_329", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_986", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_329", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "sub_15", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_762", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "mul_330", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_985", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_987", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "mul_331", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_331", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "sum_32", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_330", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "convert_element_type_1004", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_32", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "convert_element_type_1005", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_981", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1004", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "add_195", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1005", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "dtype_cast_337", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_337", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.27.attention_norm", - "name": "alias_default_1672", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_195", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w2", - "name": "alias_default_988", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_988", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_757", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w2", - "name": "einsum_default_297", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_758", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.26.feed_forward.w2", - "name": "permute_519", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_988", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_519", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w2", - "name": "einsum_default_298", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_297", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w2", - "name": "permute_520", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_520", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w2", - "name": "dtype_cast_338", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_338", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w2", - "name": "alias_default_1661", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_298", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w2", - "name": "alias_default_989", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_989", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_754", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "mul_332", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_989", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_756", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "mul_333", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_332", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "alias_default_990", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_990", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_750", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w3", - "name": "einsum_default_299", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_755", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w3", - "name": "permute_523", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_990", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_523", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w3", - "name": "einsum_default_300", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_299", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w3", - "name": "permute_524", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_524", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.26.feed_forward.w3", - "name": "dtype_cast_339", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_339", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w3", - "name": "alias_default_1662", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_333", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "convert_element_type_1014", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_752", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "convert_element_type_1015", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1015", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "alias_default_991", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_991", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "neg_37", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "exp_37", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "add_196", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_196", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "reciprocal_5", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_5", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "mul_334", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_334", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "alias_default_992", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1014", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_992", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "mul_335", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_992", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "sub_16", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_991", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_16", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "mul_336", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_336", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "add_197", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_335", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_197", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "mul_337", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_337", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "convert_element_type_1016", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1016", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward", - "name": "alias_default_993", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_993", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_750", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w1", - "name": "einsum_default_301", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_751", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w1", - "name": "permute_527", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_993", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_527", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w1", - "name": "einsum_default_302", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_300", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_302", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26", - "name": "add_198", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_301", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w1", - "name": "permute_528", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_528", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.feed_forward.w1", - "name": "dtype_cast_340", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_340", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.26.feed_forward.w1", - "name": "alias_default_1660", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_198", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "convert_element_type_1021", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_746", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "convert_element_type_1022", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_747", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "convert_element_type_1023", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1021", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "alias_default_994", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_994", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1023", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "mul_338", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1022", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_749", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "mul_339", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_338", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "alias_default_995", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_339", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": 
"alias_default_996", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_996", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_995", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "mul_340", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_340", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "sum_33", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_996", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "div_43", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_43", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_33", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "mul_341", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_995", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_341", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "sub_17", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_17", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_749", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "mul_342", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_994", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_996", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "mul_343", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_343", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "sum_34", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_342", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "convert_element_type_1024", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_34", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "convert_element_type_1025", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_988", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1024", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "add_199", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1025", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "dtype_cast_341", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_341", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.ffn_norm", - "name": "alias_default_1664", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_199", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wo", - "name": "alias_default_997", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_997", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_744", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wo", - "name": "einsum_default_303", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_745", - "src_placement": "RR", 
- "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wo", - "name": "permute_531", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_997", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_531", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wo", - "name": "einsum_default_304", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_303", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wo", - "name": "permute_532", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_532", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wo", - "name": "dtype_cast_342", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_342", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention.wo", - "name": "alias_default_1659", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_304", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_932", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_932", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "permute_533", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_533", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_740", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_741", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_742", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_743", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_235", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_240", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_241", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_5", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.sdpa", - "name": "getitem_303", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.sdpa", - "name": "getitem_304", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_5", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.sdpa", - "name": "getitem_305", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_305", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "permute_534", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_304", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "permute_535", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_303", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "permute_536", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_534", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_933", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_933", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.26.attention", - "name": "sum_35", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "squeeze_10", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_535", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_934", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_934", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "sum_36", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_36", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "squeeze_11", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "convert_element_type_1030", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_536", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "convert_element_type_1031", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1030", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_935", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_935", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_as_complex_74", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_739", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "_conj_10", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_10", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "clone_110", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_74", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_110", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": 
"mul_344", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1031", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_936", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_936", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_as_complex_75", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_739", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "_conj_11", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_11", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "clone_111", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_75", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_111", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "mul_345", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_344", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_as_real_74", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_74", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_937", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_937", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "convert_element_type_1032", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - 
"shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_345", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_as_real_75", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_75", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_938", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_938", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "convert_element_type_1033", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_939", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1032", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_940", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, 
- "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1033", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "view_941", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_939", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "alias_default_998", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_998", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_735", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wv", - "name": "einsum_default_305", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_738", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wv", - "name": "permute_539", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_998", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_539", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention.wv", - "name": "einsum_default_306", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_305", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wv", - "name": "permute_540", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_540", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wv", - "name": "dtype_cast_343", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_343", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention.wv", - "name": "alias_default_1658", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_940", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "alias_default_999", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_999", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_735", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wk", - "name": "einsum_default_307", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_737", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wk", - "name": "permute_543", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_999", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_543", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wk", - "name": "einsum_default_308", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_306", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_308", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "add_200", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_307", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wk", - "name": "permute_544", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_544", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wk", - "name": "dtype_cast_344", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_344", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention.wk", - "name": "alias_default_1657", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_941", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention", - "name": "alias_default_1000", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1000", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_735", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wq", - "name": "einsum_default_309", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_736", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wq", - "name": "permute_547", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1000", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_547", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention.wq", - "name": "einsum_default_310", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_200", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_310", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26", - "name": "add_201", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_309", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wq", - "name": "permute_548", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_548", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention.wq", - "name": "dtype_cast_345", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_345", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention.wq", - "name": "alias_default_1656", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_201", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "convert_element_type_1046", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_731", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "convert_element_type_1047", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_732", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "convert_element_type_1048", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1046", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "alias_default_1001", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1001", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1048", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "mul_346", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1047", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_734", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "mul_347", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_346", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "alias_default_1002", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_347", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "alias_default_1003", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1003", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1002", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "mul_348", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_348", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "sum_37", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1003", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "div_44", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_44", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_37", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "mul_349", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1002", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_349", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "sub_18", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_734", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "mul_350", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1001", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1003", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "mul_351", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_351", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "sum_38", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_350", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "convert_element_type_1049", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_38", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "convert_element_type_1050", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_997", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1049", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "add_202", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1050", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "dtype_cast_346", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_346", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.26.attention_norm", - "name": "alias_default_1663", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_202", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w2", - "name": "alias_default_1004", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1004", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_729", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w2", - "name": "einsum_default_311", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_730", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.25.feed_forward.w2", - "name": "permute_551", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1004", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_551", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w2", - "name": "einsum_default_312", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_311", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w2", - "name": "permute_552", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_552", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w2", - "name": "dtype_cast_347", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_347", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w2", - "name": "alias_default_1652", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_312", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w2", - "name": "alias_default_1005", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1005", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_726", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "mul_352", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1005", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_728", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "mul_353", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_352", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "alias_default_1006", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1006", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_722", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w3", - "name": "einsum_default_313", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_727", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w3", - "name": "permute_555", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1006", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_555", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w3", - "name": "einsum_default_314", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_313", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w3", - "name": "permute_556", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_556", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.25.feed_forward.w3", - "name": "dtype_cast_348", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_348", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w3", - "name": "alias_default_1653", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_353", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "convert_element_type_1059", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_724", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "convert_element_type_1060", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1060", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "alias_default_1007", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1007", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "neg_38", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_38", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "exp_38", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_38", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "add_203", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_203", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "reciprocal_6", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_6", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "mul_354", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_354", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "alias_default_1008", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1059", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1008", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "mul_355", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1008", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "sub_19", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1007", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "mul_356", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_356", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "add_204", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_355", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_204", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "mul_357", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_357", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "convert_element_type_1061", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1061", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward", - "name": "alias_default_1009", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1009", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_722", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w1", - "name": "einsum_default_315", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_723", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w1", - "name": "permute_559", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1009", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_559", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w1", - "name": "einsum_default_316", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_314", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_316", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25", - "name": "add_205", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_315", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w1", - "name": "permute_560", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_560", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.feed_forward.w1", - "name": "dtype_cast_349", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_349", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.25.feed_forward.w1", - "name": "alias_default_1651", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_205", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "convert_element_type_1066", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_718", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "convert_element_type_1067", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_719", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "convert_element_type_1068", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1066", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "alias_default_1010", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1010", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1068", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "mul_358", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1067", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_721", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "mul_359", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_358", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "alias_default_1011", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_359", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": 
"alias_default_1012", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1012", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1011", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "mul_360", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_360", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "sum_39", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1012", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "div_45", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_45", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_39", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "mul_361", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1011", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_361", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "sub_20", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_721", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "mul_362", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1010", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1012", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "mul_363", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_363", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "sum_40", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_362", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "convert_element_type_1069", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_40", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "convert_element_type_1070", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1004", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1069", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "add_206", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1070", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "dtype_cast_350", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_350", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.ffn_norm", - "name": "alias_default_1655", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_206", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wo", - "name": "alias_default_1013", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1013", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_716", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wo", - "name": "einsum_default_317", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_717", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wo", - "name": "permute_563", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1013", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_563", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wo", - "name": "einsum_default_318", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_317", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wo", - "name": "permute_564", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_564", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wo", - "name": "dtype_cast_351", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_351", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention.wo", - "name": "alias_default_1650", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_318", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_956", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_956", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "permute_565", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_565", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_712", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_713", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_714", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_715", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_226", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_231", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_232", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_6", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.sdpa", - "name": "getitem_306", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.sdpa", - "name": "getitem_307", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.sdpa", - "name": "getitem_308", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_308", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "permute_566", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_307", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "permute_567", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_306", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "permute_568", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_566", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_957", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_957", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.25.attention", - "name": "sum_41", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_41", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "squeeze_12", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_567", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_958", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_958", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "sum_42", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_42", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "squeeze_13", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "convert_element_type_1075", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_568", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "convert_element_type_1076", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1075", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_959", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_959", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_as_complex_76", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_711", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "_conj_12", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_12", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "clone_118", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_76", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_118", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": 
"mul_364", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1076", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_960", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_960", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_as_complex_77", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_711", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "_conj_13", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_13", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "clone_119", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_77", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_119", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "mul_365", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_364", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_as_real_76", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_76", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_961", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_961", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "convert_element_type_1077", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - 
"shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_365", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_as_real_77", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_77", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_962", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_962", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "convert_element_type_1078", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_963", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1077", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_964", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, 
- "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1078", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "view_965", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_963", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "alias_default_1014", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1014", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_707", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wv", - "name": "einsum_default_319", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_710", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wv", - "name": "permute_571", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1014", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_571", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention.wv", - "name": "einsum_default_320", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_319", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wv", - "name": "permute_572", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_572", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wv", - "name": "dtype_cast_352", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_352", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention.wv", - "name": "alias_default_1649", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_964", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "alias_default_1015", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1015", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_707", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wk", - "name": "einsum_default_321", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_709", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wk", - "name": "permute_575", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1015", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_575", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wk", - "name": "einsum_default_322", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_320", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_322", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "add_207", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_321", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wk", - "name": "permute_576", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_576", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wk", - "name": "dtype_cast_353", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_353", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention.wk", - "name": "alias_default_1648", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_965", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention", - "name": "alias_default_1016", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1016", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_707", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wq", - "name": "einsum_default_323", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_708", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wq", - "name": "permute_579", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1016", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_579", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention.wq", - "name": "einsum_default_324", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_207", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_324", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25", - "name": "add_208", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_323", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wq", - "name": "permute_580", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_580", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention.wq", - "name": "dtype_cast_354", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_354", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention.wq", - "name": "alias_default_1647", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_208", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "convert_element_type_1091", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_703", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "convert_element_type_1092", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_704", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "convert_element_type_1093", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1091", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "alias_default_1017", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1017", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1093", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "mul_366", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1092", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_706", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "mul_367", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_366", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "alias_default_1018", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_367", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "alias_default_1019", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1019", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1018", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "mul_368", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_368", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "sum_43", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1019", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "div_46", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_46", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_43", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "mul_369", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1018", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_369", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "sub_21", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_706", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "mul_370", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1017", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1019", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "mul_371", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_371", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "sum_44", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_370", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "convert_element_type_1094", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_44", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "convert_element_type_1095", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1013", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1094", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "add_209", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1095", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "dtype_cast_355", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_355", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.25.attention_norm", - "name": "alias_default_1654", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_209", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w2", - "name": "alias_default_1020", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1020", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_701", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w2", - "name": "einsum_default_325", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_702", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.24.feed_forward.w2", - "name": "permute_583", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1020", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_583", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w2", - "name": "einsum_default_326", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_325", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w2", - "name": "permute_584", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_584", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w2", - "name": "dtype_cast_356", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_356", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w2", - "name": "alias_default_1643", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_326", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w2", - "name": "alias_default_1021", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1021", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_698", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "mul_372", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1021", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_700", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "mul_373", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_372", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "alias_default_1022", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1022", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_694", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w3", - "name": "einsum_default_327", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_699", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w3", - "name": "permute_587", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1022", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_587", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w3", - "name": "einsum_default_328", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_327", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w3", - "name": "permute_588", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_588", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.24.feed_forward.w3", - "name": "dtype_cast_357", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_357", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w3", - "name": "alias_default_1644", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_373", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "convert_element_type_1104", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_696", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "convert_element_type_1105", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1105", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "alias_default_1023", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1023", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "neg_39", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_39", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "exp_39", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_39", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "add_210", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_210", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "reciprocal_7", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_7", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "mul_374", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_374", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "alias_default_1024", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1104", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1024", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "mul_375", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1024", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "sub_22", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1023", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_22", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "mul_376", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_376", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "add_211", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_375", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_211", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "mul_377", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_377", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "convert_element_type_1106", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1106", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward", - "name": "alias_default_1025", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1025", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_694", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w1", - "name": "einsum_default_329", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_695", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w1", - "name": "permute_591", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1025", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_591", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w1", - "name": "einsum_default_330", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_328", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_330", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24", - "name": "add_212", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_329", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w1", - "name": "permute_592", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_592", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.feed_forward.w1", - "name": "dtype_cast_358", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_358", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.24.feed_forward.w1", - "name": "alias_default_1642", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_212", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "convert_element_type_1111", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_690", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "convert_element_type_1112", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_691", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "convert_element_type_1113", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1111", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "alias_default_1026", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1026", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1113", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "mul_378", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1112", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_693", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "mul_379", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_378", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "alias_default_1027", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_379", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": 
"alias_default_1028", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1028", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1027", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "mul_380", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_380", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "sum_45", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1028", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "div_47", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_47", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_45", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "mul_381", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1027", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_381", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "sub_23", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_23", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_693", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "mul_382", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1026", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1028", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "mul_383", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_383", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "sum_46", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_382", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "convert_element_type_1114", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_46", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "convert_element_type_1115", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1020", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1114", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "add_213", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1115", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "dtype_cast_359", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_359", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.ffn_norm", - "name": "alias_default_1646", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_213", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wo", - "name": "alias_default_1029", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1029", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_688", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wo", - "name": "einsum_default_331", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_689", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wo", - "name": "permute_595", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1029", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_595", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wo", - "name": "einsum_default_332", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_331", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wo", - "name": "permute_596", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_596", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wo", - "name": "dtype_cast_360", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_360", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention.wo", - "name": "alias_default_1641", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_332", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_980", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_980", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "permute_597", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_597", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_684", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_685", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_686", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_687", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_217", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_222", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_223", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_7", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.sdpa", - "name": "getitem_309", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.sdpa", - "name": "getitem_310", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_7", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.sdpa", - "name": "getitem_311", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_311", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "permute_598", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_310", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "permute_599", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_309", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "permute_600", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_598", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_981", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_981", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.24.attention", - "name": "sum_47", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_47", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "squeeze_14", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_599", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_982", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_982", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "sum_48", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_48", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "squeeze_15", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "convert_element_type_1120", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_600", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "convert_element_type_1121", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1120", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_983", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_983", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_as_complex_78", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_683", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "_conj_14", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_14", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "clone_126", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_78", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_126", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": 
"mul_384", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1121", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_984", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_984", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_as_complex_79", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_683", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "_conj_15", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_15", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "clone_127", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_79", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_127", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "mul_385", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_384", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_as_real_78", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_78", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_985", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_985", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "convert_element_type_1122", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - 
"shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_385", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_as_real_79", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_79", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_986", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_986", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "convert_element_type_1123", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_14", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_987", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1122", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_988", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, 
- "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1123", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "view_989", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_987", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "alias_default_1030", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1030", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_679", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wv", - "name": "einsum_default_333", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), 
self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_682", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wv", - "name": "permute_603", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1030", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_603", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention.wv", - "name": "einsum_default_334", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_333", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wv", - "name": "permute_604", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_604", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wv", - "name": "dtype_cast_361", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_361", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention.wv", - "name": "alias_default_1640", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_988", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "alias_default_1031", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1031", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_679", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wk", - "name": "einsum_default_335", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_681", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wk", - "name": "permute_607", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1031", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_607", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wk", - "name": "einsum_default_336", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_334", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_336", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "add_214", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_335", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wk", - "name": "permute_608", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_608", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wk", - "name": "dtype_cast_362", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_362", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention.wk", - "name": "alias_default_1639", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_989", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention", - "name": "alias_default_1032", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1032", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_679", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wq", - "name": "einsum_default_337", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_680", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wq", - "name": "permute_611", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1032", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_611", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention.wq", - "name": "einsum_default_338", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_214", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_338", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24", - "name": "add_215", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_337", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wq", - "name": "permute_612", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_612", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention.wq", - "name": "dtype_cast_363", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_363", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention.wq", - "name": "alias_default_1638", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_215", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "convert_element_type_1136", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_675", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "convert_element_type_1137", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_676", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "convert_element_type_1138", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1136", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "alias_default_1033", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1033", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1138", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "mul_386", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1137", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_678", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "mul_387", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_386", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "alias_default_1034", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_387", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "alias_default_1035", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1035", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1034", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "mul_388", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_388", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "sum_49", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1035", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "div_48", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_48", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_49", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "mul_389", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1034", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_389", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "sub_24", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_678", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "mul_390", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1033", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1035", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "mul_391", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_391", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "sum_50", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_390", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "convert_element_type_1139", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_50", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "convert_element_type_1140", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1029", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1139", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "add_216", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1140", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "dtype_cast_364", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_364", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.24.attention_norm", - "name": "alias_default_1645", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_216", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w2", - "name": "alias_default_1036", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1036", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_673", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w2", - "name": "einsum_default_339", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_674", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.23.feed_forward.w2", - "name": "permute_615", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1036", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_615", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w2", - "name": "einsum_default_340", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_339", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w2", - "name": "permute_616", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_616", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w2", - "name": "dtype_cast_365", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_365", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w2", - "name": "alias_default_1634", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_340", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w2", - "name": "alias_default_1037", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1037", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_670", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "mul_392", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1037", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_672", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "mul_393", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_392", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "alias_default_1038", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1038", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_666", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w3", - "name": "einsum_default_341", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_671", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w3", - "name": "permute_619", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1038", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_619", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w3", - "name": "einsum_default_342", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_341", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w3", - "name": "permute_620", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_620", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.23.feed_forward.w3", - "name": "dtype_cast_366", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_366", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w3", - "name": "alias_default_1635", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_393", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "convert_element_type_1149", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_668", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "convert_element_type_1150", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1150", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "alias_default_1039", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1039", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "neg_40", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_40", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "exp_40", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_40", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "add_217", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_217", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "reciprocal_8", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_8", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "mul_394", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_394", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "alias_default_1040", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1149", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1040", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "mul_395", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1040", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "sub_25", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1039", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "mul_396", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_396", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "add_218", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_395", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_218", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "mul_397", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_397", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "convert_element_type_1151", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1151", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward", - "name": "alias_default_1041", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1041", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_666", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w1", - "name": "einsum_default_343", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_667", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w1", - "name": "permute_623", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1041", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_623", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w1", - "name": "einsum_default_344", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_342", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_344", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23", - "name": "add_219", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_343", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w1", - "name": "permute_624", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_624", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.feed_forward.w1", - "name": "dtype_cast_367", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_367", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.23.feed_forward.w1", - "name": "alias_default_1633", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_219", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "convert_element_type_1156", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_662", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "convert_element_type_1157", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_663", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "convert_element_type_1158", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1156", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "alias_default_1042", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1042", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1158", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "mul_398", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1157", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_665", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "mul_399", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_398", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "alias_default_1043", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_399", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": 
"alias_default_1044", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1044", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1043", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "mul_400", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_400", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "sum_51", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1044", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "div_49", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_49", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_51", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "mul_401", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1043", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_401", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "sub_26", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_665", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "mul_402", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1042", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1044", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "mul_403", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_403", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "sum_52", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_402", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "convert_element_type_1159", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_52", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "convert_element_type_1160", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1036", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1159", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "add_220", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1160", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "dtype_cast_368", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_368", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.ffn_norm", - "name": "alias_default_1637", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_220", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wo", - "name": "alias_default_1045", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1045", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_660", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wo", - "name": "einsum_default_345", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_661", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wo", - "name": "permute_627", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1045", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_627", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wo", - "name": "einsum_default_346", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_345", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wo", - "name": "permute_628", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_628", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wo", - "name": "dtype_cast_369", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_369", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention.wo", - "name": "alias_default_1632", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_346", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_1004", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1004", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "permute_629", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_629", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_656", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_657", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_658", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_659", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_208", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_213", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_214", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_8", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_8", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.sdpa", - "name": "getitem_312", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_8", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.sdpa", - "name": "getitem_313", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_8", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.sdpa", - "name": "getitem_314", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_314", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "permute_630", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_313", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "permute_631", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_312", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "permute_632", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_630", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_1005", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1005", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.23.attention", - "name": "sum_53", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "squeeze_16", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_631", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_1006", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1006", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "sum_54", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_54", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "squeeze_17", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "convert_element_type_1165", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_632", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "convert_element_type_1166", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1165", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_1007", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1007", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_as_complex_80", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_655", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "_conj_16", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_16", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "clone_134", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_80", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_134", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": 
"mul_404", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1166", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_1008", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1008", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_as_complex_81", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_655", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "_conj_17", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_17", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "clone_135", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_81", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_135", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "mul_405", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_404", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_as_real_80", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_80", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_1009", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1009", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "convert_element_type_1167", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", 
- "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_405", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_as_real_81", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_81", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_1010", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1010", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "convert_element_type_1168", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_16", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_1011", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1167", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_1012", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1168", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "view_1013", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1011", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "alias_default_1046", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1046", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_651", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wv", - "name": "einsum_default_347", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_654", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wv", - "name": "permute_635", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1046", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_635", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention.wv", - "name": "einsum_default_348", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_347", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wv", - "name": "permute_636", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_636", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wv", - "name": "dtype_cast_370", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_370", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention.wv", - "name": "alias_default_1631", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1012", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "alias_default_1047", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1047", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_651", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wk", - "name": "einsum_default_349", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_653", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wk", - "name": "permute_639", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1047", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_639", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wk", - "name": "einsum_default_350", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_348", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_350", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "add_221", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_349", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wk", - "name": "permute_640", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_640", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wk", - "name": "dtype_cast_371", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_371", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention.wk", - "name": "alias_default_1630", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1013", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention", - "name": "alias_default_1048", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1048", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_651", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wq", - "name": "einsum_default_351", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_652", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wq", - "name": "permute_643", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1048", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_643", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention.wq", - "name": "einsum_default_352", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_221", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_352", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23", - "name": "add_222", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_351", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wq", - "name": "permute_644", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_644", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention.wq", - "name": "dtype_cast_372", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_372", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention.wq", - "name": "alias_default_1629", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_222", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "convert_element_type_1181", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_647", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "convert_element_type_1182", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_648", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "convert_element_type_1183", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1181", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "alias_default_1049", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1049", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1183", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "mul_406", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1182", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_650", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "mul_407", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_406", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "alias_default_1050", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_407", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "alias_default_1051", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1051", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1050", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "mul_408", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_408", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "sum_55", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1051", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "div_50", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_50", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_55", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "mul_409", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1050", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_409", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "sub_27", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_650", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "mul_410", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1049", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1051", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "mul_411", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_411", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "sum_56", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_410", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "convert_element_type_1184", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_56", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "convert_element_type_1185", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1045", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1184", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "add_223", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1185", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "dtype_cast_373", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_373", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.23.attention_norm", - "name": "alias_default_1636", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_223", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w2", - "name": "alias_default_1052", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1052", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_645", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w2", - "name": "einsum_default_353", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_646", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.22.feed_forward.w2", - "name": "permute_647", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1052", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_647", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w2", - "name": "einsum_default_354", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_353", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w2", - "name": "permute_648", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_648", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w2", - "name": "dtype_cast_374", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_374", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w2", - "name": "alias_default_1625", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_354", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w2", - "name": "alias_default_1053", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1053", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_642", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "mul_412", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1053", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_644", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "mul_413", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_412", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "alias_default_1054", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1054", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_638", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w3", - "name": "einsum_default_355", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_643", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w3", - "name": "permute_651", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1054", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_651", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w3", - "name": "einsum_default_356", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_355", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w3", - "name": "permute_652", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_652", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.22.feed_forward.w3", - "name": "dtype_cast_375", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_375", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w3", - "name": "alias_default_1626", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_413", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "convert_element_type_1194", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_640", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "convert_element_type_1195", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1195", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "alias_default_1055", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1055", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "neg_41", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_41", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "exp_41", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_41", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "add_224", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_224", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "reciprocal_9", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_9", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "mul_414", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_414", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "alias_default_1056", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1194", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1056", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "mul_415", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1056", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "sub_28", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1055", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "mul_416", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_416", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "add_225", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_415", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_225", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "mul_417", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_417", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "convert_element_type_1196", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1196", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward", - "name": "alias_default_1057", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1057", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_638", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w1", - "name": "einsum_default_357", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_639", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w1", - "name": "permute_655", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1057", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_655", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w1", - "name": "einsum_default_358", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_356", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_358", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22", - "name": "add_226", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_357", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w1", - "name": "permute_656", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_656", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.feed_forward.w1", - "name": "dtype_cast_376", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_376", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.22.feed_forward.w1", - "name": "alias_default_1624", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_226", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "convert_element_type_1201", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_634", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "convert_element_type_1202", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_635", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "convert_element_type_1203", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1201", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "alias_default_1058", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1058", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1203", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "mul_418", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1202", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_637", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "mul_419", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_418", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "alias_default_1059", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_419", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": 
"alias_default_1060", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1060", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1059", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "mul_420", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_420", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "sum_57", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1060", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "div_51", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_51", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_57", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "mul_421", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1059", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_421", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "sub_29", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_29", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_637", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "mul_422", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1058", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1060", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "mul_423", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_423", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "sum_58", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_422", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "convert_element_type_1204", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_58", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "convert_element_type_1205", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1052", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1204", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "add_227", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1205", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "dtype_cast_377", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_377", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.ffn_norm", - "name": "alias_default_1628", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_227", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wo", - "name": "alias_default_1061", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1061", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_632", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wo", - "name": "einsum_default_359", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_633", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wo", - "name": "permute_659", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1061", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_659", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wo", - "name": "einsum_default_360", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_359", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wo", - "name": "permute_660", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_660", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wo", - "name": "dtype_cast_378", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_378", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention.wo", - "name": "alias_default_1623", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_360", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_1028", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1028", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "permute_661", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_661", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_628", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_629", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_630", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_631", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_199", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_204", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_205", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_9", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.sdpa", - "name": "getitem_315", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.sdpa", - "name": "getitem_316", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_9", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.sdpa", - "name": "getitem_317", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_317", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "permute_662", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_316", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "permute_663", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_315", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "permute_664", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_662", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_1029", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1029", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.22.attention", - "name": "sum_59", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "squeeze_18", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_663", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_1030", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1030", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "sum_60", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_60", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "squeeze_19", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "convert_element_type_1210", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_664", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "convert_element_type_1211", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1210", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_1031", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1031", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_as_complex_82", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_627", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "_conj_18", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_18", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "clone_142", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_82", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_142", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": 
"mul_424", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1211", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_1032", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1032", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_as_complex_83", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_627", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "_conj_19", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_19", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "clone_143", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_83", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_143", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "mul_425", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_424", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_as_real_82", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_82", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_1033", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1033", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "convert_element_type_1212", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", 
- "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_425", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_as_real_83", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_83", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_1034", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1034", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "convert_element_type_1213", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_1035", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1212", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_1036", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1213", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "view_1037", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1035", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "alias_default_1062", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1062", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_623", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wv", - "name": "einsum_default_361", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_626", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wv", - "name": "permute_667", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1062", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_667", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention.wv", - "name": "einsum_default_362", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_361", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wv", - "name": "permute_668", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_668", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wv", - "name": "dtype_cast_379", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_379", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention.wv", - "name": "alias_default_1622", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1036", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "alias_default_1063", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1063", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_623", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wk", - "name": "einsum_default_363", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_625", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wk", - "name": "permute_671", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1063", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_671", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wk", - "name": "einsum_default_364", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_362", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_364", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "add_228", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_363", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wk", - "name": "permute_672", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_672", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wk", - "name": "dtype_cast_380", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_380", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention.wk", - "name": "alias_default_1621", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1037", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention", - "name": "alias_default_1064", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1064", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_623", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wq", - "name": "einsum_default_365", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_624", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wq", - "name": "permute_675", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1064", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_675", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention.wq", - "name": "einsum_default_366", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_228", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_366", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22", - "name": "add_229", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_365", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wq", - "name": "permute_676", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_676", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention.wq", - "name": "dtype_cast_381", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_381", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention.wq", - "name": "alias_default_1620", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_229", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "convert_element_type_1226", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_619", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "convert_element_type_1227", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_620", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "convert_element_type_1228", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1226", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "alias_default_1065", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1065", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1228", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "mul_426", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1227", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_622", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "mul_427", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_426", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "alias_default_1066", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_427", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "alias_default_1067", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1067", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1066", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "mul_428", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_428", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "sum_61", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1067", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "div_52", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_52", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_61", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "mul_429", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1066", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_429", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "sub_30", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_30", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_622", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "mul_430", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1065", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1067", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "mul_431", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_431", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "sum_62", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_430", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "convert_element_type_1229", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_62", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "convert_element_type_1230", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1061", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1229", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "add_230", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1230", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "dtype_cast_382", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_382", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.22.attention_norm", - "name": "alias_default_1627", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_230", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w2", - "name": "alias_default_1068", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1068", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_617", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w2", - "name": "einsum_default_367", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_618", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.21.feed_forward.w2", - "name": "permute_679", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1068", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_679", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w2", - "name": "einsum_default_368", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_367", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w2", - "name": "permute_680", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_680", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w2", - "name": "dtype_cast_383", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_383", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w2", - "name": "alias_default_1616", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_368", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w2", - "name": "alias_default_1069", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1069", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_614", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "mul_432", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1069", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_616", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "mul_433", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_432", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "alias_default_1070", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1070", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_610", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w3", - "name": "einsum_default_369", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_615", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w3", - "name": "permute_683", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1070", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_683", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w3", - "name": "einsum_default_370", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_369", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w3", - "name": "permute_684", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_684", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.21.feed_forward.w3", - "name": "dtype_cast_384", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_384", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w3", - "name": "alias_default_1617", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_433", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "convert_element_type_1239", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_612", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "convert_element_type_1240", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1240", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "alias_default_1071", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1071", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "neg_42", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_42", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "exp_42", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_42", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "add_231", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_231", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "reciprocal_10", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_10", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "mul_434", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_434", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "alias_default_1072", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1239", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1072", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "mul_435", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1072", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "sub_31", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1071", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "mul_436", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_436", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "add_232", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_435", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_232", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "mul_437", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_437", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "convert_element_type_1241", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1241", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward", - "name": "alias_default_1073", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1073", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_610", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w1", - "name": "einsum_default_371", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_611", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w1", - "name": "permute_687", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1073", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_687", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w1", - "name": "einsum_default_372", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_370", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_372", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21", - "name": "add_233", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_371", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w1", - "name": "permute_688", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_688", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.feed_forward.w1", - "name": "dtype_cast_385", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_385", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.21.feed_forward.w1", - "name": "alias_default_1615", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_233", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "convert_element_type_1246", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_606", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "convert_element_type_1247", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_607", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "convert_element_type_1248", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1246", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "alias_default_1074", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1074", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1248", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "mul_438", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1247", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_609", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "mul_439", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_438", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "alias_default_1075", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_439", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": 
"alias_default_1076", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1076", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1075", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "mul_440", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_440", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "sum_63", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1076", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "div_53", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_53", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_63", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "mul_441", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1075", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_441", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "sub_32", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_32", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_609", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "mul_442", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1074", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1076", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "mul_443", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_443", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "sum_64", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_442", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "convert_element_type_1249", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_64", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "convert_element_type_1250", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1068", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1249", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "add_234", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1250", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "dtype_cast_386", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_386", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.ffn_norm", - "name": "alias_default_1619", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_234", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wo", - "name": "alias_default_1077", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1077", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_604", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wo", - "name": "einsum_default_373", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_605", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wo", - "name": "permute_691", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1077", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_691", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wo", - "name": "einsum_default_374", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_373", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wo", - "name": "permute_692", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_692", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wo", - "name": "dtype_cast_387", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_387", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention.wo", - "name": "alias_default_1614", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_374", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_1052", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1052", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "permute_693", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_693", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_600", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_601", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_602", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_603", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_190", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_195", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_196", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_10", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_10", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.sdpa", - "name": "getitem_318", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_10", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.sdpa", - "name": "getitem_319", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_10", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.sdpa", - "name": "getitem_320", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_320", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "permute_694", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_319", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "permute_695", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_318", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "permute_696", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_694", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_1053", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1053", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.21.attention", - "name": "sum_65", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_65", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "squeeze_20", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_695", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_1054", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1054", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "sum_66", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_66", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "squeeze_21", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_21", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "convert_element_type_1255", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_696", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "convert_element_type_1256", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1255", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_1055", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1055", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_as_complex_84", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_599", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "_conj_20", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_20", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "clone_150", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_84", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_150", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": 
"mul_444", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1256", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_1056", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1056", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_as_complex_85", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_599", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "_conj_21", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_21", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "clone_151", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_85", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_151", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "mul_445", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_444", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_as_real_84", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_84", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_1057", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1057", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "convert_element_type_1257", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", 
- "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_445", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_as_real_85", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_85", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_1058", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1058", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "convert_element_type_1258", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_20", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_1059", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1257", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_1060", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1258", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "view_1061", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1059", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "alias_default_1078", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1078", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_595", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wv", - "name": "einsum_default_375", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_598", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wv", - "name": "permute_699", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1078", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_699", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention.wv", - "name": "einsum_default_376", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_375", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wv", - "name": "permute_700", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_700", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wv", - "name": "dtype_cast_388", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_388", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention.wv", - "name": "alias_default_1613", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1060", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "alias_default_1079", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1079", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_595", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wk", - "name": "einsum_default_377", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_597", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wk", - "name": "permute_703", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1079", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_703", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wk", - "name": "einsum_default_378", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_376", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_378", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "add_235", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_377", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wk", - "name": "permute_704", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_704", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wk", - "name": "dtype_cast_389", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_389", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention.wk", - "name": "alias_default_1612", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1061", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention", - "name": "alias_default_1080", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1080", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_595", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wq", - "name": "einsum_default_379", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_596", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wq", - "name": "permute_707", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1080", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_707", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention.wq", - "name": "einsum_default_380", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_235", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_380", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21", - "name": "add_236", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_379", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wq", - "name": "permute_708", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_708", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention.wq", - "name": "dtype_cast_390", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_390", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention.wq", - "name": "alias_default_1611", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_236", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "convert_element_type_1271", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_591", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "convert_element_type_1272", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_592", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "convert_element_type_1273", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1271", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "alias_default_1081", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1081", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1273", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "mul_446", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1272", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_594", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "mul_447", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_446", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "alias_default_1082", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_447", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "alias_default_1083", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1083", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1082", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "mul_448", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_448", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "sum_67", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1083", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "div_54", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_54", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_67", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "mul_449", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1082", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_449", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "sub_33", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_33", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_594", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "mul_450", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1081", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1083", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "mul_451", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_451", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "sum_68", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_450", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "convert_element_type_1274", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_68", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "convert_element_type_1275", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1077", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1274", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "add_237", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1275", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "dtype_cast_391", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_391", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.21.attention_norm", - "name": "alias_default_1618", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_237", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w2", - "name": "alias_default_1084", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1084", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_589", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w2", - "name": "einsum_default_381", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_590", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.20.feed_forward.w2", - "name": "permute_711", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1084", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_711", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w2", - "name": "einsum_default_382", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_381", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w2", - "name": "permute_712", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_712", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w2", - "name": "dtype_cast_392", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_392", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w2", - "name": "alias_default_1607", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_382", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w2", - "name": "alias_default_1085", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1085", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_586", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "mul_452", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1085", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_588", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "mul_453", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_452", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "alias_default_1086", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1086", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_582", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w3", - "name": "einsum_default_383", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_587", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w3", - "name": "permute_715", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1086", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_715", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w3", - "name": "einsum_default_384", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_383", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w3", - "name": "permute_716", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_716", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.20.feed_forward.w3", - "name": "dtype_cast_393", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_393", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w3", - "name": "alias_default_1608", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_453", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "convert_element_type_1284", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_584", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "convert_element_type_1285", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1285", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "alias_default_1087", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1087", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "neg_43", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "exp_43", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "add_238", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_238", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "reciprocal_11", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_11", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "mul_454", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_454", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "alias_default_1088", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1284", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1088", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "mul_455", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1088", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "sub_34", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1087", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "mul_456", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_456", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "add_239", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_455", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_239", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "mul_457", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_457", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "convert_element_type_1286", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1286", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward", - "name": "alias_default_1089", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1089", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_582", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w1", - "name": "einsum_default_385", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_583", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w1", - "name": "permute_719", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1089", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_719", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w1", - "name": "einsum_default_386", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_384", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_386", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20", - "name": "add_240", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_385", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w1", - "name": "permute_720", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_720", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.feed_forward.w1", - "name": "dtype_cast_394", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_394", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.20.feed_forward.w1", - "name": "alias_default_1606", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_240", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "convert_element_type_1291", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_578", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "convert_element_type_1292", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_579", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "convert_element_type_1293", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1291", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "alias_default_1090", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1090", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1293", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "mul_458", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1292", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_581", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "mul_459", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_458", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "alias_default_1091", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_459", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": 
"alias_default_1092", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1092", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1091", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "mul_460", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_460", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "sum_69", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1092", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "div_55", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_55", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_69", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "mul_461", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1091", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_461", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "sub_35", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_35", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_581", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "mul_462", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1090", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1092", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "mul_463", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_463", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "sum_70", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_462", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "convert_element_type_1294", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_70", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "convert_element_type_1295", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1084", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1294", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "add_241", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1295", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "dtype_cast_395", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_395", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.ffn_norm", - "name": "alias_default_1610", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_241", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wo", - "name": "alias_default_1093", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1093", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_576", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wo", - "name": "einsum_default_387", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_577", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wo", - "name": "permute_723", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1093", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_723", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wo", - "name": "einsum_default_388", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_387", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wo", - "name": "permute_724", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_724", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wo", - "name": "dtype_cast_396", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_396", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention.wo", - "name": "alias_default_1605", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_388", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_1076", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1076", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "permute_725", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_725", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_572", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_573", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_574", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_575", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_181", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_186", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_187", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_11", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_11", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.sdpa", - "name": "getitem_321", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_11", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.sdpa", - "name": "getitem_322", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_11", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.sdpa", - "name": "getitem_323", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_323", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "permute_726", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_322", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "permute_727", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_321", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "permute_728", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_726", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_1077", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1077", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.20.attention", - "name": "sum_71", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_71", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "squeeze_22", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_727", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_1078", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1078", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "sum_72", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_72", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "squeeze_23", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "convert_element_type_1300", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_728", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "convert_element_type_1301", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1300", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_1079", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1079", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_as_complex_86", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_571", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "_conj_22", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_22", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "clone_158", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_86", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_158", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": 
"mul_464", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1301", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_1080", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1080", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_as_complex_87", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_571", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "_conj_23", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_23", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "clone_159", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_87", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_159", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "mul_465", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_464", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_as_real_86", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_86", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_1081", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1081", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "convert_element_type_1302", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", 
- "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_465", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_as_real_87", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_87", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_1082", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1082", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "convert_element_type_1303", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_22", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_1083", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1302", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_1084", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1303", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "view_1085", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1083", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "alias_default_1094", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1094", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_567", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wv", - "name": "einsum_default_389", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_570", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wv", - "name": "permute_731", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1094", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_731", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention.wv", - "name": "einsum_default_390", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_389", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wv", - "name": "permute_732", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_732", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wv", - "name": "dtype_cast_397", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_397", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention.wv", - "name": "alias_default_1604", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1084", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "alias_default_1095", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1095", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_567", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wk", - "name": "einsum_default_391", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_569", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wk", - "name": "permute_735", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1095", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_735", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wk", - "name": "einsum_default_392", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_390", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_392", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "add_242", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_391", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wk", - "name": "permute_736", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_736", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wk", - "name": "dtype_cast_398", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_398", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention.wk", - "name": "alias_default_1603", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1085", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention", - "name": "alias_default_1096", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1096", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_567", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wq", - "name": "einsum_default_393", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_568", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wq", - "name": "permute_739", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1096", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_739", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention.wq", - "name": "einsum_default_394", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_242", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_394", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20", - "name": "add_243", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_393", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wq", - "name": "permute_740", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_740", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention.wq", - "name": "dtype_cast_399", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_399", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention.wq", - "name": "alias_default_1602", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_243", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "convert_element_type_1316", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_563", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "convert_element_type_1317", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_564", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "convert_element_type_1318", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1316", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "alias_default_1097", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1097", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1318", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "mul_466", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1317", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_566", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "mul_467", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_466", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "alias_default_1098", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_467", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "alias_default_1099", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1099", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1098", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "mul_468", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_468", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "sum_73", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1099", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "div_56", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_56", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_73", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "mul_469", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1098", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_469", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "sub_36", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_36", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_566", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "mul_470", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1097", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1099", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "mul_471", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_471", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "sum_74", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_470", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "convert_element_type_1319", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_74", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "convert_element_type_1320", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1093", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1319", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "add_244", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1320", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "dtype_cast_400", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_400", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.20.attention_norm", - "name": "alias_default_1609", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_244", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w2", - "name": "alias_default_1100", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1100", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_561", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w2", - "name": "einsum_default_395", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_562", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.19.feed_forward.w2", - "name": "permute_743", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1100", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_743", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w2", - "name": "einsum_default_396", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_395", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w2", - "name": "permute_744", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_744", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w2", - "name": "dtype_cast_401", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_401", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w2", - "name": "alias_default_1598", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_396", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w2", - "name": "alias_default_1101", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1101", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_558", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "mul_472", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1101", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_560", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "mul_473", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_472", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "alias_default_1102", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1102", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_554", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w3", - "name": "einsum_default_397", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_559", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w3", - "name": "permute_747", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1102", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_747", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w3", - "name": "einsum_default_398", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_397", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w3", - "name": "permute_748", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_748", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.19.feed_forward.w3", - "name": "dtype_cast_402", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_402", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w3", - "name": "alias_default_1599", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_473", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "convert_element_type_1329", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_556", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "convert_element_type_1330", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1330", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "alias_default_1103", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1103", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "neg_44", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_44", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "exp_44", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_44", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "add_245", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_245", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "reciprocal_12", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_12", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "mul_474", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_474", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "alias_default_1104", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1329", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1104", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "mul_475", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1104", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "sub_37", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1103", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "mul_476", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_476", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "add_246", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_475", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_246", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "mul_477", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_477", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "convert_element_type_1331", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1331", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward", - "name": "alias_default_1105", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1105", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_554", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w1", - "name": "einsum_default_399", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_555", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w1", - "name": "permute_751", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1105", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_751", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w1", - "name": "einsum_default_400", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_398", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_400", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19", - "name": "add_247", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_399", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w1", - "name": "permute_752", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_752", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.feed_forward.w1", - "name": "dtype_cast_403", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_403", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.19.feed_forward.w1", - "name": "alias_default_1597", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_247", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "convert_element_type_1336", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_550", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "convert_element_type_1337", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_551", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "convert_element_type_1338", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1336", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "alias_default_1106", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1106", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1338", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "mul_478", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1337", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_553", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "mul_479", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_478", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "alias_default_1107", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_479", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": 
"alias_default_1108", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1108", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1107", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "mul_480", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_480", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "sum_75", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1108", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "div_57", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_57", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_75", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "mul_481", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1107", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_481", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "sub_38", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_38", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_553", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "mul_482", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1106", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1108", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "mul_483", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_483", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "sum_76", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_482", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "convert_element_type_1339", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_76", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "convert_element_type_1340", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1100", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1339", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "add_248", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1340", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "dtype_cast_404", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_404", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.ffn_norm", - "name": "alias_default_1601", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_248", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wo", - "name": "alias_default_1109", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1109", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_548", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wo", - "name": "einsum_default_401", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_549", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wo", - "name": "permute_755", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1109", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_755", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wo", - "name": "einsum_default_402", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_401", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wo", - "name": "permute_756", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_756", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wo", - "name": "dtype_cast_405", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_405", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention.wo", - "name": "alias_default_1596", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_402", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_1100", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1100", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "permute_757", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_757", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_544", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_545", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_546", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_547", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_172", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_177", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_178", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_12", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.sdpa", - "name": "getitem_324", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.sdpa", - "name": "getitem_325", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.sdpa", - "name": "getitem_326", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_326", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "permute_758", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_325", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "permute_759", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_324", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "permute_760", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_758", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_1101", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1101", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.19.attention", - "name": "sum_77", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_77", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "squeeze_24", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_759", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_1102", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1102", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "sum_78", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_78", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "squeeze_25", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "convert_element_type_1345", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_760", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "convert_element_type_1346", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1345", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_1103", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1103", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_as_complex_88", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_543", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "_conj_24", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_24", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "clone_166", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_88", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_166", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": 
"mul_484", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1346", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_1104", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1104", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_as_complex_89", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_543", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "_conj_25", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_25", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "clone_167", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_89", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_167", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "mul_485", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_484", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_as_real_88", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_88", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_1105", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1105", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "convert_element_type_1347", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", 
- "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_485", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_as_real_89", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_89", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_1106", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1106", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "convert_element_type_1348", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_1107", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1347", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_1108", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1348", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "view_1109", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1107", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "alias_default_1110", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1110", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_539", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wv", - "name": "einsum_default_403", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_542", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wv", - "name": "permute_763", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1110", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_763", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention.wv", - "name": "einsum_default_404", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_403", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wv", - "name": "permute_764", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_764", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wv", - "name": "dtype_cast_406", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_406", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention.wv", - "name": "alias_default_1595", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1108", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "alias_default_1111", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1111", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_539", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wk", - "name": "einsum_default_405", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_541", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wk", - "name": "permute_767", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1111", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_767", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wk", - "name": "einsum_default_406", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_404", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_406", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "add_249", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_405", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wk", - "name": "permute_768", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_768", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wk", - "name": "dtype_cast_407", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_407", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention.wk", - "name": "alias_default_1594", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1109", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention", - "name": "alias_default_1112", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1112", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_539", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wq", - "name": "einsum_default_407", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_540", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wq", - "name": "permute_771", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1112", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_771", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention.wq", - "name": "einsum_default_408", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_249", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_408", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19", - "name": "add_250", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_407", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wq", - "name": "permute_772", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_772", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention.wq", - "name": "dtype_cast_408", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_408", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention.wq", - "name": "alias_default_1593", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_250", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "convert_element_type_1361", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_535", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "convert_element_type_1362", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_536", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "convert_element_type_1363", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1361", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "alias_default_1113", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1113", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1363", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "mul_486", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1362", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_538", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "mul_487", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_486", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "alias_default_1114", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_487", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "alias_default_1115", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1115", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1114", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "mul_488", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_488", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "sum_79", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1115", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "div_58", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_58", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_79", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "mul_489", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1114", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_489", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "sub_39", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_39", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_538", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "mul_490", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1113", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1115", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "mul_491", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_491", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "sum_80", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_490", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "convert_element_type_1364", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_80", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "convert_element_type_1365", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1109", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1364", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "add_251", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1365", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "dtype_cast_409", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_409", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.19.attention_norm", - "name": "alias_default_1600", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_251", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w2", - "name": "alias_default_1116", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1116", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_533", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w2", - "name": "einsum_default_409", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_534", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.18.feed_forward.w2", - "name": "permute_775", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1116", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_775", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w2", - "name": "einsum_default_410", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_409", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w2", - "name": "permute_776", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_776", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w2", - "name": "dtype_cast_410", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_410", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w2", - "name": "alias_default_1589", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_410", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w2", - "name": "alias_default_1117", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1117", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_530", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "mul_492", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1117", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_532", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "mul_493", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_492", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "alias_default_1118", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1118", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_526", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w3", - "name": "einsum_default_411", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_531", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w3", - "name": "permute_779", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1118", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_779", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w3", - "name": "einsum_default_412", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_411", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w3", - "name": "permute_780", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_780", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.18.feed_forward.w3", - "name": "dtype_cast_411", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_411", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w3", - "name": "alias_default_1590", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_493", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "convert_element_type_1374", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_528", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "convert_element_type_1375", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1375", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "alias_default_1119", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1119", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "neg_45", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_45", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "exp_45", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_45", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "add_252", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_252", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "reciprocal_13", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_13", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "mul_494", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_494", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "alias_default_1120", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1374", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1120", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "mul_495", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1120", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "sub_40", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1119", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_40", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "mul_496", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_496", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "add_253", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_495", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_253", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "mul_497", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_497", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "convert_element_type_1376", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1376", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward", - "name": "alias_default_1121", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1121", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_526", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w1", - "name": "einsum_default_413", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_527", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w1", - "name": "permute_783", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1121", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_783", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w1", - "name": "einsum_default_414", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_412", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_414", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18", - "name": "add_254", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_413", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w1", - "name": "permute_784", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_784", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.feed_forward.w1", - "name": "dtype_cast_412", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_412", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.18.feed_forward.w1", - "name": "alias_default_1588", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_254", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "convert_element_type_1381", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_522", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "convert_element_type_1382", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_523", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "convert_element_type_1383", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1381", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "alias_default_1122", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1122", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1383", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "mul_498", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1382", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_525", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "mul_499", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_498", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "alias_default_1123", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_499", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": 
"alias_default_1124", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1124", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1123", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "mul_500", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_500", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "sum_81", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1124", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "div_59", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_59", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_81", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "mul_501", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1123", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_501", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "sub_41", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_41", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_525", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "mul_502", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1122", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1124", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "mul_503", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_503", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "sum_82", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_502", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "convert_element_type_1384", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_82", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "convert_element_type_1385", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1116", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1384", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "add_255", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1385", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "dtype_cast_413", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_413", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.ffn_norm", - "name": "alias_default_1592", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_255", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wo", - "name": "alias_default_1125", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1125", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_520", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wo", - "name": "einsum_default_415", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_521", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wo", - "name": "permute_787", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1125", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_787", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wo", - "name": "einsum_default_416", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_415", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wo", - "name": "permute_788", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_788", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wo", - "name": "dtype_cast_414", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_414", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention.wo", - "name": "alias_default_1587", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_416", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_1124", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1124", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "permute_789", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_789", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_516", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_517", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_518", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_519", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_163", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_168", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_169", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_13", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_13", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.sdpa", - "name": "getitem_327", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_13", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.sdpa", - "name": "getitem_328", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_13", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.sdpa", - "name": "getitem_329", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_329", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "permute_790", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_328", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "permute_791", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_327", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "permute_792", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_790", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_1125", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1125", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.18.attention", - "name": "sum_83", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_83", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "squeeze_26", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_791", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_1126", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1126", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "sum_84", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_84", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "squeeze_27", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_27", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "convert_element_type_1390", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_792", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "convert_element_type_1391", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1390", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_1127", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1127", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_as_complex_90", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_515", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "_conj_26", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_26", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "clone_174", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_90", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_174", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": 
"mul_504", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1391", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_1128", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1128", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_as_complex_91", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_515", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "_conj_27", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_27", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "clone_175", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_91", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_175", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "mul_505", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_504", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_as_real_90", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_90", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_1129", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1129", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "convert_element_type_1392", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", 
- "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_505", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_as_real_91", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_91", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_1130", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1130", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "convert_element_type_1393", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_1131", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1392", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_1132", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1393", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "view_1133", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1131", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "alias_default_1126", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1126", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_511", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wv", - "name": "einsum_default_417", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_514", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wv", - "name": "permute_795", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1126", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_795", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention.wv", - "name": "einsum_default_418", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_417", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wv", - "name": "permute_796", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_796", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wv", - "name": "dtype_cast_415", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_415", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention.wv", - "name": "alias_default_1586", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1132", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "alias_default_1127", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1127", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_511", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wk", - "name": "einsum_default_419", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_513", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wk", - "name": "permute_799", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1127", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_799", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wk", - "name": "einsum_default_420", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_418", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_420", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "add_256", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_419", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wk", - "name": "permute_800", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_800", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wk", - "name": "dtype_cast_416", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_416", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention.wk", - "name": "alias_default_1585", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1133", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention", - "name": "alias_default_1128", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1128", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_511", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wq", - "name": "einsum_default_421", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_512", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wq", - "name": "permute_803", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1128", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_803", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention.wq", - "name": "einsum_default_422", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_256", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_422", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18", - "name": "add_257", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_421", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wq", - "name": "permute_804", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_804", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention.wq", - "name": "dtype_cast_417", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_417", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention.wq", - "name": "alias_default_1584", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_257", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "convert_element_type_1406", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_507", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "convert_element_type_1407", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_508", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "convert_element_type_1408", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1406", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "alias_default_1129", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1129", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1408", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "mul_506", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1407", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_510", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "mul_507", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_506", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "alias_default_1130", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_507", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "alias_default_1131", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1131", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1130", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "mul_508", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_508", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "sum_85", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1131", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "div_60", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_60", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_85", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "mul_509", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1130", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_509", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "sub_42", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_42", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_510", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "mul_510", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1129", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1131", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "mul_511", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_511", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "sum_86", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_510", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "convert_element_type_1409", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_86", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "convert_element_type_1410", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1125", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1409", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "add_258", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1410", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "dtype_cast_418", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_418", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.18.attention_norm", - "name": "alias_default_1591", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_258", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w2", - "name": "alias_default_1132", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1132", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_505", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w2", - "name": "einsum_default_423", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_506", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.17.feed_forward.w2", - "name": "permute_807", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1132", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_807", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w2", - "name": "einsum_default_424", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_423", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w2", - "name": "permute_808", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_808", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w2", - "name": "dtype_cast_419", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_419", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w2", - "name": "alias_default_1580", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_424", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w2", - "name": "alias_default_1133", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1133", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_502", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "mul_512", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1133", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_504", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "mul_513", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_512", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "alias_default_1134", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1134", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_498", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w3", - "name": "einsum_default_425", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_503", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w3", - "name": "permute_811", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1134", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_811", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w3", - "name": "einsum_default_426", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_425", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w3", - "name": "permute_812", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_812", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.17.feed_forward.w3", - "name": "dtype_cast_420", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_420", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w3", - "name": "alias_default_1581", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_513", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "convert_element_type_1419", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_500", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "convert_element_type_1420", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1420", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "alias_default_1135", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1135", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "neg_46", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_46", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "exp_46", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_46", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "add_259", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_259", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "reciprocal_14", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_14", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "mul_514", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_514", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "alias_default_1136", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1419", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1136", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "mul_515", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1136", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "sub_43", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1135", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "mul_516", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_516", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "add_260", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_515", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_260", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "mul_517", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_517", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "convert_element_type_1421", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1421", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward", - "name": "alias_default_1137", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1137", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_498", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w1", - "name": "einsum_default_427", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_499", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w1", - "name": "permute_815", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1137", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_815", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w1", - "name": "einsum_default_428", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_426", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_428", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17", - "name": "add_261", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_427", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w1", - "name": "permute_816", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_816", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.feed_forward.w1", - "name": "dtype_cast_421", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_421", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.17.feed_forward.w1", - "name": "alias_default_1579", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_261", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "convert_element_type_1426", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_494", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "convert_element_type_1427", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_495", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "convert_element_type_1428", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1426", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "alias_default_1138", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1138", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1428", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "mul_518", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1427", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_497", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "mul_519", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_518", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "alias_default_1139", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_519", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": 
"alias_default_1140", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1140", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1139", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "mul_520", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_520", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "sum_87", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1140", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "div_61", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_61", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_87", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "mul_521", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1139", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_521", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "sub_44", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_44", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_497", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "mul_522", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1138", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1140", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "mul_523", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_523", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "sum_88", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_522", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "convert_element_type_1429", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_88", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "convert_element_type_1430", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1132", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1429", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "add_262", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1430", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "dtype_cast_422", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_422", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.ffn_norm", - "name": "alias_default_1583", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_262", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wo", - "name": "alias_default_1141", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1141", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_492", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wo", - "name": "einsum_default_429", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_493", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wo", - "name": "permute_819", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1141", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_819", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wo", - "name": "einsum_default_430", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_429", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wo", - "name": "permute_820", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_820", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wo", - "name": "dtype_cast_423", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_423", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention.wo", - "name": "alias_default_1578", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_430", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_1148", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1148", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "permute_821", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_821", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_488", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_489", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_490", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_491", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_154", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_159", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_160", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_14", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.sdpa", - "name": "getitem_330", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.sdpa", - "name": "getitem_331", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.sdpa", - "name": "getitem_332", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_332", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "permute_822", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_331", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "permute_823", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_330", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "permute_824", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_822", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_1149", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1149", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.17.attention", - "name": "sum_89", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_89", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "squeeze_28", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_823", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_1150", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1150", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "sum_90", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_90", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "squeeze_29", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "convert_element_type_1435", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_824", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "convert_element_type_1436", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1435", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_1151", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1151", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_as_complex_92", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_487", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "_conj_28", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_28", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "clone_182", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_92", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_182", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": 
"mul_524", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1436", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_1152", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1152", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_as_complex_93", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_487", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "_conj_29", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_29", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "clone_183", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_93", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_183", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "mul_525", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_524", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_as_real_92", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_92", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_1153", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1153", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "convert_element_type_1437", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", 
- "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_525", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_as_real_93", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_93", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_1154", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1154", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "convert_element_type_1438", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_1155", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1437", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_1156", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1438", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "view_1157", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1155", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "alias_default_1142", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1142", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_483", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wv", - "name": "einsum_default_431", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_486", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wv", - "name": "permute_827", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1142", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_827", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention.wv", - "name": "einsum_default_432", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_431", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wv", - "name": "permute_828", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_828", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wv", - "name": "dtype_cast_424", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_424", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention.wv", - "name": "alias_default_1577", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1156", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "alias_default_1143", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1143", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_483", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wk", - "name": "einsum_default_433", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_485", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wk", - "name": "permute_831", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1143", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_831", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wk", - "name": "einsum_default_434", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_432", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_434", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "add_263", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_433", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wk", - "name": "permute_832", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_832", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wk", - "name": "dtype_cast_425", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_425", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention.wk", - "name": "alias_default_1576", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1157", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention", - "name": "alias_default_1144", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1144", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_483", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wq", - "name": "einsum_default_435", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_484", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wq", - "name": "permute_835", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1144", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_835", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention.wq", - "name": "einsum_default_436", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_263", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_436", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17", - "name": "add_264", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_435", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wq", - "name": "permute_836", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_836", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention.wq", - "name": "dtype_cast_426", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_426", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention.wq", - "name": "alias_default_1575", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_264", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "convert_element_type_1451", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_479", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "convert_element_type_1452", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_480", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "convert_element_type_1453", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1451", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "alias_default_1145", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1145", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1453", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "mul_526", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1452", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_482", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "mul_527", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_526", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "alias_default_1146", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_527", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "alias_default_1147", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1147", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1146", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "mul_528", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_528", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "sum_91", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1147", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "div_62", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_62", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_91", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "mul_529", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1146", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_529", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "sub_45", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_45", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_482", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "mul_530", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1145", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1147", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "mul_531", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_531", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "sum_92", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_530", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "convert_element_type_1454", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_92", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "convert_element_type_1455", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1141", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1454", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "add_265", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1455", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "dtype_cast_427", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_427", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.17.attention_norm", - "name": "alias_default_1582", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_265", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w2", - "name": "alias_default_1148", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1148", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_477", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w2", - "name": "einsum_default_437", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_478", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.16.feed_forward.w2", - "name": "permute_839", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1148", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_839", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w2", - "name": "einsum_default_438", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_437", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w2", - "name": "permute_840", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_840", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w2", - "name": "dtype_cast_428", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_428", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w2", - "name": "alias_default_1571", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_438", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w2", - "name": "alias_default_1149", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1149", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_474", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "mul_532", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1149", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_476", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "mul_533", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_532", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "alias_default_1150", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1150", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_470", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w3", - "name": "einsum_default_439", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_475", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w3", - "name": "permute_843", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1150", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_843", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w3", - "name": "einsum_default_440", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_439", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w3", - "name": "permute_844", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_844", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.16.feed_forward.w3", - "name": "dtype_cast_429", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_429", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w3", - "name": "alias_default_1572", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_533", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "convert_element_type_1464", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_472", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "convert_element_type_1465", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1465", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "alias_default_1151", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1151", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "neg_47", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_47", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "exp_47", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_47", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "add_266", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_266", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "reciprocal_15", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_15", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "mul_534", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_534", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "alias_default_1152", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1464", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1152", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "mul_535", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1152", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "sub_46", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1151", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_46", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "mul_536", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_536", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "add_267", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_535", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_267", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "mul_537", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_537", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "convert_element_type_1466", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1466", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward", - "name": "alias_default_1153", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1153", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_470", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w1", - "name": "einsum_default_441", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_471", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w1", - "name": "permute_847", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1153", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_847", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w1", - "name": "einsum_default_442", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_440", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_442", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16", - "name": "add_268", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_441", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w1", - "name": "permute_848", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_848", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.feed_forward.w1", - "name": "dtype_cast_430", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_430", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.16.feed_forward.w1", - "name": "alias_default_1570", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_268", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "convert_element_type_1471", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_466", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "convert_element_type_1472", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_467", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "convert_element_type_1473", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1471", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "alias_default_1154", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1154", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1473", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "mul_538", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1472", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_469", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "mul_539", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_538", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "alias_default_1155", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_539", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": 
"alias_default_1156", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1156", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1155", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "mul_540", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_540", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "sum_93", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1156", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "div_63", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_63", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_93", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "mul_541", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1155", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_541", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "sub_47", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_47", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_469", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "mul_542", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1154", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1156", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "mul_543", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_543", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "sum_94", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_542", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "convert_element_type_1474", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_94", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "convert_element_type_1475", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1148", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1474", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "add_269", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1475", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "dtype_cast_431", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_431", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.ffn_norm", - "name": "alias_default_1574", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_269", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wo", - "name": "alias_default_1157", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1157", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_464", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wo", - "name": "einsum_default_443", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_465", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wo", - "name": "permute_851", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1157", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_851", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wo", - "name": "einsum_default_444", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_443", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wo", - "name": "permute_852", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_852", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wo", - "name": "dtype_cast_432", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_432", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention.wo", - "name": "alias_default_1569", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_444", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_1172", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1172", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "permute_853", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_853", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_460", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_461", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_462", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_463", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_145", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_150", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_151", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_15", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.sdpa", - "name": "getitem_333", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.sdpa", - "name": "getitem_334", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_15", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.sdpa", - "name": "getitem_335", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_335", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "permute_854", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_334", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "permute_855", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_333", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "permute_856", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_854", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_1173", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1173", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.16.attention", - "name": "sum_95", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_95", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "squeeze_30", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_855", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_1174", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1174", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "sum_96", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_96", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "squeeze_31", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "convert_element_type_1480", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_856", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "convert_element_type_1481", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1480", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_1175", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1175", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_as_complex_94", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_459", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "_conj_30", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_30", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "clone_190", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_94", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_190", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": 
"mul_544", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1481", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_1176", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1176", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_as_complex_95", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_459", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "_conj_31", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_31", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "clone_191", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_95", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_191", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "mul_545", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_544", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_as_real_94", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_94", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_1177", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1177", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "convert_element_type_1482", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", 
- "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_545", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_as_real_95", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_95", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_1178", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1178", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "convert_element_type_1483", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_30", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_1179", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1482", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_1180", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1483", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "view_1181", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1179", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "alias_default_1158", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1158", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_455", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wv", - "name": "einsum_default_445", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_458", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wv", - "name": "permute_859", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1158", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_859", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention.wv", - "name": "einsum_default_446", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_445", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wv", - "name": "permute_860", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_860", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wv", - "name": "dtype_cast_433", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_433", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention.wv", - "name": "alias_default_1568", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1180", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "alias_default_1159", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1159", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_455", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wk", - "name": "einsum_default_447", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_457", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wk", - "name": "permute_863", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1159", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_863", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wk", - "name": "einsum_default_448", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_446", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_448", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "add_270", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_447", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wk", - "name": "permute_864", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_864", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wk", - "name": "dtype_cast_434", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_434", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention.wk", - "name": "alias_default_1567", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1181", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention", - "name": "alias_default_1160", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1160", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_455", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wq", - "name": "einsum_default_449", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_456", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wq", - "name": "permute_867", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1160", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_867", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention.wq", - "name": "einsum_default_450", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_270", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_450", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16", - "name": "add_271", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_449", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wq", - "name": "permute_868", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_868", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention.wq", - "name": "dtype_cast_435", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_435", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention.wq", - "name": "alias_default_1566", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_271", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "convert_element_type_1496", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_451", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "convert_element_type_1497", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_452", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "convert_element_type_1498", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1496", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "alias_default_1161", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1161", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1498", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "mul_546", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1497", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_454", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "mul_547", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_546", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "alias_default_1162", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_547", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "alias_default_1163", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1163", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1162", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "mul_548", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_548", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "sum_97", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1163", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "div_64", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_64", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_97", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "mul_549", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1162", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_549", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "sub_48", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_48", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_454", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "mul_550", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1161", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1163", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "mul_551", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_551", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "sum_98", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_550", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "convert_element_type_1499", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_98", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "convert_element_type_1500", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1157", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1499", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "add_272", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1500", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "dtype_cast_436", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_436", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.16.attention_norm", - "name": "alias_default_1573", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_272", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w2", - "name": "alias_default_1164", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1164", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_449", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w2", - "name": "einsum_default_451", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_450", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.15.feed_forward.w2", - "name": "permute_871", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1164", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_871", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w2", - "name": "einsum_default_452", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_451", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w2", - "name": "permute_872", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_872", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w2", - "name": "dtype_cast_437", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_437", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w2", - "name": "alias_default_1562", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_452", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w2", - "name": "alias_default_1165", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1165", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_446", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "mul_552", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1165", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_448", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "mul_553", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_552", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "alias_default_1166", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1166", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_442", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w3", - "name": "einsum_default_453", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_447", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w3", - "name": "permute_875", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1166", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_875", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w3", - "name": "einsum_default_454", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_453", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w3", - "name": "permute_876", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_876", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.15.feed_forward.w3", - "name": "dtype_cast_438", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_438", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w3", - "name": "alias_default_1563", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_553", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "convert_element_type_1509", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_444", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "convert_element_type_1510", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1510", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "alias_default_1167", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1167", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "neg_48", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_48", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "exp_48", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_48", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "add_273", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_273", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "reciprocal_16", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_16", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "mul_554", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_554", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "alias_default_1168", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1509", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1168", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "mul_555", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1168", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "sub_49", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1167", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_49", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "mul_556", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_556", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "add_274", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_555", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_274", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "mul_557", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_557", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "convert_element_type_1511", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1511", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward", - "name": "alias_default_1169", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1169", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_442", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w1", - "name": "einsum_default_455", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_443", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w1", - "name": "permute_879", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1169", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_879", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w1", - "name": "einsum_default_456", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_454", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_456", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15", - "name": "add_275", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_455", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w1", - "name": "permute_880", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_880", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.feed_forward.w1", - "name": "dtype_cast_439", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_439", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.15.feed_forward.w1", - "name": "alias_default_1561", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_275", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "convert_element_type_1516", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_438", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "convert_element_type_1517", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_439", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "convert_element_type_1518", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1516", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "alias_default_1170", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1170", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1518", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "mul_558", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1517", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_441", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "mul_559", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_558", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "alias_default_1171", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_559", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": 
"alias_default_1172", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1172", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1171", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "mul_560", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_560", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "sum_99", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1172", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "div_65", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_65", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_99", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "mul_561", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1171", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_561", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "sub_50", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_50", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_441", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "mul_562", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1170", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1172", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "mul_563", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_563", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "sum_100", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_562", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "convert_element_type_1519", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_100", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "convert_element_type_1520", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1164", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1519", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "add_276", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1520", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "dtype_cast_440", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_440", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.ffn_norm", - "name": "alias_default_1565", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_276", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wo", - "name": "alias_default_1173", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1173", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_436", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wo", - "name": "einsum_default_457", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_437", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wo", - "name": "permute_883", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1173", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_883", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wo", - "name": "einsum_default_458", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_457", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wo", - "name": "permute_884", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_884", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wo", - "name": "dtype_cast_441", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_441", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention.wo", - "name": "alias_default_1560", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_458", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_1196", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1196", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "permute_885", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_885", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_432", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_433", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_434", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_435", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_136", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_141", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_142", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_16", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.sdpa", - "name": "getitem_336", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.sdpa", - "name": "getitem_337", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.sdpa", - "name": "getitem_338", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_338", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "permute_886", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_337", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "permute_887", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_336", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "permute_888", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_886", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_1197", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1197", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.15.attention", - "name": "sum_101", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_101", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "squeeze_32", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_887", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_1198", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1198", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "sum_102", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_102", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "squeeze_33", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_33", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "convert_element_type_1525", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_888", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "convert_element_type_1526", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1525", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_1199", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1199", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_as_complex_96", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_431", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "_conj_32", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_32", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "clone_198", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_96", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_198", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": 
"mul_564", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1526", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_1200", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1200", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_as_complex_97", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_431", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "_conj_33", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_33", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "clone_199", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_97", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_199", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "mul_565", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_564", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_as_real_96", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_96", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_1201", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1201", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "convert_element_type_1527", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", 
- "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_565", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_as_real_97", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_97", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_1202", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1202", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "convert_element_type_1528", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_32", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_1203", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1527", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_1204", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1528", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "view_1205", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1203", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "alias_default_1174", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1174", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_427", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wv", - "name": "einsum_default_459", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_430", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wv", - "name": "permute_891", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1174", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_891", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention.wv", - "name": "einsum_default_460", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_459", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wv", - "name": "permute_892", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_892", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wv", - "name": "dtype_cast_442", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_442", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention.wv", - "name": "alias_default_1559", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1204", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "alias_default_1175", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1175", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_427", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wk", - "name": "einsum_default_461", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_429", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wk", - "name": "permute_895", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1175", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_895", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wk", - "name": "einsum_default_462", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_460", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_462", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "add_277", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_461", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wk", - "name": "permute_896", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_896", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wk", - "name": "dtype_cast_443", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_443", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention.wk", - "name": "alias_default_1558", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1205", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention", - "name": "alias_default_1176", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1176", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_427", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wq", - "name": "einsum_default_463", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_428", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wq", - "name": "permute_899", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1176", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_899", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention.wq", - "name": "einsum_default_464", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_277", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_464", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15", - "name": "add_278", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_463", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wq", - "name": "permute_900", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_900", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention.wq", - "name": "dtype_cast_444", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_444", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention.wq", - "name": "alias_default_1557", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_278", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "convert_element_type_1541", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_423", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "convert_element_type_1542", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_424", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "convert_element_type_1543", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1541", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "alias_default_1177", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1177", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1543", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "mul_566", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1542", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_426", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "mul_567", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_566", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "alias_default_1178", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_567", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "alias_default_1179", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1179", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1178", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "mul_568", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_568", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "sum_103", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1179", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "div_66", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_66", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_103", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "mul_569", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1178", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_569", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "sub_51", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_51", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_426", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "mul_570", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1177", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1179", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "mul_571", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_571", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "sum_104", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_570", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "convert_element_type_1544", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_104", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "convert_element_type_1545", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1173", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1544", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "add_279", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1545", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "dtype_cast_445", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_445", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.15.attention_norm", - "name": "alias_default_1564", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_279", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w2", - "name": "alias_default_1180", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1180", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_421", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w2", - "name": "einsum_default_465", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_422", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.14.feed_forward.w2", - "name": "permute_903", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1180", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_903", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w2", - "name": "einsum_default_466", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_465", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w2", - "name": "permute_904", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_904", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w2", - "name": "dtype_cast_446", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_446", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w2", - "name": "alias_default_1553", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_466", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w2", - "name": "alias_default_1181", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1181", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_418", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "mul_572", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1181", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_420", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "mul_573", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_572", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "alias_default_1182", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1182", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_414", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w3", - "name": "einsum_default_467", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_419", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w3", - "name": "permute_907", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1182", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_907", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w3", - "name": "einsum_default_468", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_467", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w3", - "name": "permute_908", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_908", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.14.feed_forward.w3", - "name": "dtype_cast_447", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_447", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w3", - "name": "alias_default_1554", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_573", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "convert_element_type_1554", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_416", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "convert_element_type_1555", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1555", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "alias_default_1183", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1183", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "neg_49", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_49", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "exp_49", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_49", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "add_280", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_280", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "reciprocal_17", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_17", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "mul_574", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_574", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "alias_default_1184", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1554", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1184", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "mul_575", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1184", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "sub_52", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1183", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "mul_576", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_576", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "add_281", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_575", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_281", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "mul_577", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_577", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "convert_element_type_1556", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1556", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward", - "name": "alias_default_1185", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1185", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_414", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w1", - "name": "einsum_default_469", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_415", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w1", - "name": "permute_911", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1185", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_911", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w1", - "name": "einsum_default_470", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_468", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_470", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14", - "name": "add_282", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_469", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w1", - "name": "permute_912", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_912", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.feed_forward.w1", - "name": "dtype_cast_448", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_448", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.14.feed_forward.w1", - "name": "alias_default_1552", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_282", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "convert_element_type_1561", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_410", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "convert_element_type_1562", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_411", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "convert_element_type_1563", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1561", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "alias_default_1186", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1186", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1563", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "mul_578", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1562", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_413", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "mul_579", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_578", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "alias_default_1187", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_579", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": 
"alias_default_1188", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1188", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1187", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "mul_580", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_580", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "sum_105", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1188", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "div_67", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_67", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_105", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "mul_581", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1187", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_581", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "sub_53", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_53", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_413", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "mul_582", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1186", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1188", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "mul_583", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_583", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "sum_106", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_582", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "convert_element_type_1564", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_106", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "convert_element_type_1565", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1180", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1564", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "add_283", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1565", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "dtype_cast_449", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_449", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.ffn_norm", - "name": "alias_default_1556", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_283", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wo", - "name": "alias_default_1189", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1189", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_408", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wo", - "name": "einsum_default_471", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_409", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wo", - "name": "permute_915", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1189", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_915", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wo", - "name": "einsum_default_472", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_471", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wo", - "name": "permute_916", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_916", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wo", - "name": "dtype_cast_450", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_450", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention.wo", - "name": "alias_default_1551", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_472", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_1220", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1220", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "permute_917", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_917", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_404", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_405", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_406", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_407", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_127", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_132", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_133", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_17", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_17", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.sdpa", - "name": "getitem_339", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_17", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.sdpa", - "name": "getitem_340", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_17", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.sdpa", - "name": "getitem_341", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_341", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "permute_918", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_340", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "permute_919", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_339", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "permute_920", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_918", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_1221", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1221", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.14.attention", - "name": "sum_107", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_107", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "squeeze_34", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_919", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_1222", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1222", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "sum_108", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_108", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "squeeze_35", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_35", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "convert_element_type_1570", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_920", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "convert_element_type_1571", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1570", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_1223", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1223", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_as_complex_98", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_403", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "_conj_34", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_34", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "clone_206", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_98", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_206", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": 
"mul_584", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1571", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_1224", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1224", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_as_complex_99", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_403", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "_conj_35", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_35", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "clone_207", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_99", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_207", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "mul_585", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_584", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_as_real_98", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_98", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_1225", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1225", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "convert_element_type_1572", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", 
- "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_585", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_as_real_99", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_99", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_1226", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1226", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "convert_element_type_1573", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_34", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_1227", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1572", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_1228", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 
0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1573", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "view_1229", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1227", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "alias_default_1190", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1190", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_399", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wv", - "name": "einsum_default_473", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), 
self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_402", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wv", - "name": "permute_923", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1190", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_923", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention.wv", - "name": "einsum_default_474", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_473", - "src_placement": "P(sum)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wv", - "name": "permute_924", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_924", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wv", - "name": "dtype_cast_451", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_451", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention.wv", - "name": "alias_default_1550", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1228", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "alias_default_1191", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1191", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_399", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wk", - "name": "einsum_default_475", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_401", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wk", - "name": "permute_927", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1191", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_927", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wk", - "name": "einsum_default_476", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_474", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_476", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "add_284", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_475", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wk", - "name": "permute_928", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_928", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wk", - "name": "dtype_cast_452", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_452", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention.wk", - "name": "alias_default_1549", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1229", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention", - "name": "alias_default_1192", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1192", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_399", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wq", - "name": "einsum_default_477", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_400", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wq", - "name": "permute_931", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1192", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_931", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention.wq", - "name": "einsum_default_478", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_284", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_478", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14", - "name": "add_285", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_477", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wq", - "name": "permute_932", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_932", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention.wq", - "name": "dtype_cast_453", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_453", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention.wq", - "name": "alias_default_1548", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_285", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "convert_element_type_1586", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_395", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "convert_element_type_1587", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_396", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "convert_element_type_1588", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1586", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "alias_default_1193", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1193", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1588", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "mul_586", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1587", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_398", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "mul_587", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_586", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "alias_default_1194", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_587", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "alias_default_1195", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1195", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1194", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "mul_588", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_588", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "sum_109", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1195", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "div_68", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_68", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_109", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "mul_589", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1194", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_589", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "sub_54", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_54", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_398", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "mul_590", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1193", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1195", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "mul_591", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_591", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "sum_110", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_590", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "convert_element_type_1589", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_110", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "convert_element_type_1590", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1189", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1589", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "add_286", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1590", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "dtype_cast_454", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_454", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.14.attention_norm", - "name": "alias_default_1555", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_286", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w2", - "name": "alias_default_1196", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1196", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_393", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w2", - "name": "einsum_default_479", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_394", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.13.feed_forward.w2", - "name": "permute_935", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1196", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_935", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w2", - "name": "einsum_default_480", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_479", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w2", - "name": "permute_936", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_936", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w2", - "name": "dtype_cast_455", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_455", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w2", - "name": "alias_default_1544", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_480", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w2", - "name": "alias_default_1197", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1197", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_390", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "mul_592", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1197", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_392", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "mul_593", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_592", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "alias_default_1198", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1198", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_386", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w3", - "name": "einsum_default_481", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_391", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w3", - "name": "permute_939", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1198", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_939", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w3", - "name": "einsum_default_482", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_481", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w3", - "name": "permute_940", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_940", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.13.feed_forward.w3", - "name": "dtype_cast_456", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_456", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w3", - "name": "alias_default_1545", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_593", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "convert_element_type_1599", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_388", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "convert_element_type_1600", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1600", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "alias_default_1199", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1199", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "neg_50", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_50", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "exp_50", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_50", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "add_287", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_287", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "reciprocal_18", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_18", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "mul_594", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_594", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "alias_default_1200", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1599", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1200", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "mul_595", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1200", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "sub_55", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1199", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_55", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "mul_596", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_596", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "add_288", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_595", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_288", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "mul_597", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_597", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "convert_element_type_1601", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1601", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward", - "name": "alias_default_1201", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1201", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_386", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w1", - "name": "einsum_default_483", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_387", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w1", - "name": "permute_943", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1201", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_943", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w1", - "name": "einsum_default_484", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_482", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_484", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13", - "name": "add_289", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_483", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w1", - "name": "permute_944", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_944", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.feed_forward.w1", - "name": "dtype_cast_457", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_457", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.13.feed_forward.w1", - "name": "alias_default_1543", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_289", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "convert_element_type_1606", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_382", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "convert_element_type_1607", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_383", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "convert_element_type_1608", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1606", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "alias_default_1202", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1202", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1608", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "mul_598", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1607", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_385", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "mul_599", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_598", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "alias_default_1203", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_599", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": 
"alias_default_1204", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1204", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1203", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "mul_600", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_600", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "sum_111", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1204", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "div_69", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_69", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_111", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "mul_601", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1203", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_601", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "sub_56", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_56", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_385", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "mul_602", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1202", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1204", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "mul_603", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_603", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "sum_112", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_602", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "convert_element_type_1609", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_112", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "convert_element_type_1610", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1196", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1609", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "add_290", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1610", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "dtype_cast_458", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_458", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.ffn_norm", - "name": "alias_default_1547", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_290", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wo", - "name": "alias_default_1205", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1205", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_380", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wo", - "name": "einsum_default_485", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_381", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wo", - "name": "permute_947", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1205", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_947", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wo", - "name": "einsum_default_486", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_485", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wo", - "name": "permute_948", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_948", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wo", - "name": "dtype_cast_459", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_459", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention.wo", - "name": "alias_default_1542", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_486", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_1244", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1244", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "permute_949", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_949", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_376", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_377", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_378", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_379", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_118", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_123", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_124", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_18", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.sdpa", - "name": "getitem_342", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.sdpa", - "name": "getitem_343", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.sdpa", - "name": "getitem_344", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_344", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "permute_950", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_343", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "permute_951", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_342", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "permute_952", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_950", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_1245", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1245", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.13.attention", - "name": "sum_113", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_113", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "squeeze_36", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_951", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_1246", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1246", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "sum_114", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_114", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "squeeze_37", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_37", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "convert_element_type_1615", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_952", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "convert_element_type_1616", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1615", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_1247", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1247", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_as_complex_100", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_375", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "_conj_36", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_36", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "clone_214", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_100", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_214", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": 
"mul_604", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1616", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_1248", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1248", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_as_complex_101", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_375", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "_conj_37", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_37", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "clone_215", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_101", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_215", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "mul_605", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_604", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_as_real_100", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_100", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_1249", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1249", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "convert_element_type_1617", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_605", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_as_real_101", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_101", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_1250", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"view_1250", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "convert_element_type_1618", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_36", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_1251", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1617", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_1252", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1618", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "view_1253", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1251", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "alias_default_1206", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1206", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_371", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wv", - "name": "einsum_default_487", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = 
self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_374", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wv", - "name": "permute_955", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1206", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_955", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention.wv", - "name": "einsum_default_488", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_487", - "src_placement": 
"P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wv", - "name": "permute_956", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_956", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wv", - "name": "dtype_cast_460", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_460", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention.wv", - "name": "alias_default_1541", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1252", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "alias_default_1207", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1207", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_371", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wk", - "name": "einsum_default_489", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_373", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wk", - "name": "permute_959", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1207", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_959", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wk", - "name": "einsum_default_490", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_488", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_490", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "add_291", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_489", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wk", - "name": "permute_960", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_960", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wk", - "name": "dtype_cast_461", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_461", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention.wk", - "name": "alias_default_1540", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1253", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention", - "name": "alias_default_1208", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1208", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_371", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wq", - "name": "einsum_default_491", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_372", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wq", - "name": "permute_963", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1208", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_963", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention.wq", - "name": "einsum_default_492", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_291", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_492", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13", - "name": "add_292", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_491", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wq", - "name": "permute_964", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_964", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention.wq", - "name": "dtype_cast_462", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_462", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention.wq", - "name": "alias_default_1539", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_292", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "convert_element_type_1631", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_367", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "convert_element_type_1632", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_368", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "convert_element_type_1633", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1631", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "alias_default_1209", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1209", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1633", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "mul_606", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1632", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_370", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "mul_607", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_606", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "alias_default_1210", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_607", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "alias_default_1211", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1211", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1210", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "mul_608", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_608", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "sum_115", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1211", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "div_70", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_70", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_115", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "mul_609", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1210", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_609", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "sub_57", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_57", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_370", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "mul_610", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1209", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1211", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "mul_611", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_611", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "sum_116", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_610", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "convert_element_type_1634", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_116", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "convert_element_type_1635", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1205", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1634", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "add_293", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1635", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "dtype_cast_463", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_463", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.13.attention_norm", - "name": "alias_default_1546", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_293", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w2", - "name": "alias_default_1212", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1212", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_365", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w2", - "name": "einsum_default_493", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_366", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.12.feed_forward.w2", - "name": "permute_967", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1212", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_967", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w2", - "name": "einsum_default_494", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_493", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w2", - "name": "permute_968", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_968", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w2", - "name": "dtype_cast_464", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_464", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w2", - "name": "alias_default_1535", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_494", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w2", - "name": "alias_default_1213", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1213", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_362", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "mul_612", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1213", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_364", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "mul_613", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_612", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "alias_default_1214", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1214", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_358", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w3", - "name": "einsum_default_495", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_363", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w3", - "name": "permute_971", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1214", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_971", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w3", - "name": "einsum_default_496", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_495", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w3", - "name": "permute_972", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_972", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.12.feed_forward.w3", - "name": "dtype_cast_465", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_465", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w3", - "name": "alias_default_1536", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_613", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "convert_element_type_1644", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_360", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "convert_element_type_1645", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1645", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "alias_default_1215", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1215", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "neg_51", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_51", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "exp_51", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_51", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "add_294", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_294", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "reciprocal_19", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_19", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "mul_614", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_614", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "alias_default_1216", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1644", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1216", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "mul_615", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1216", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "sub_58", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1215", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "mul_616", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_616", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "add_295", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_615", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_295", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "mul_617", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_617", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "convert_element_type_1646", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1646", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward", - "name": "alias_default_1217", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1217", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_358", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w1", - "name": "einsum_default_497", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_359", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w1", - "name": "permute_975", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1217", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_975", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w1", - "name": "einsum_default_498", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_496", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_498", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12", - "name": "add_296", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_497", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w1", - "name": "permute_976", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_976", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.feed_forward.w1", - "name": "dtype_cast_466", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_466", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.12.feed_forward.w1", - "name": "alias_default_1534", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_296", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "convert_element_type_1651", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_354", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "convert_element_type_1652", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_355", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "convert_element_type_1653", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1651", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "alias_default_1218", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1218", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1653", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "mul_618", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1652", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_357", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "mul_619", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_618", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "alias_default_1219", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_619", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": 
"alias_default_1220", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1220", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1219", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "mul_620", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_620", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "sum_117", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1220", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "div_71", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_71", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_117", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "mul_621", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1219", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_621", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "sub_59", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_59", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_357", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "mul_622", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1218", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1220", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "mul_623", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_623", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "sum_118", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_622", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "convert_element_type_1654", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_118", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "convert_element_type_1655", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1212", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1654", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "add_297", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1655", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "dtype_cast_467", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_467", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.ffn_norm", - "name": "alias_default_1538", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_297", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wo", - "name": "alias_default_1221", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1221", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_352", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wo", - "name": "einsum_default_499", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_353", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wo", - "name": "permute_979", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1221", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_979", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wo", - "name": "einsum_default_500", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_499", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wo", - "name": "permute_980", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_980", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wo", - "name": "dtype_cast_468", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_468", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention.wo", - "name": "alias_default_1533", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_500", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_1268", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1268", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "permute_981", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_981", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_348", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_349", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_350", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_351", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_109", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_114", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_115", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_19", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.sdpa", - "name": "getitem_345", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.sdpa", - "name": "getitem_346", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.sdpa", - "name": "getitem_347", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_347", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "permute_982", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_346", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "permute_983", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_345", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "permute_984", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_982", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_1269", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1269", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.12.attention", - "name": "sum_119", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_119", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "squeeze_38", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_983", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_1270", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1270", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "sum_120", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_120", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "squeeze_39", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_39", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "convert_element_type_1660", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_984", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "convert_element_type_1661", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1660", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_1271", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1271", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_as_complex_102", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_347", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "_conj_38", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_38", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "clone_222", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_102", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_222", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": 
"mul_624", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1661", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_1272", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1272", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_as_complex_103", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_347", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "_conj_39", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_39", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "clone_223", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_103", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_223", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "mul_625", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_624", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_as_real_102", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_102", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_1273", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1273", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "convert_element_type_1662", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_625", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_as_real_103", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_103", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_1274", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"view_1274", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "convert_element_type_1663", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_38", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_1275", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1662", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_1276", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1663", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "view_1277", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1275", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "alias_default_1222", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1222", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_343", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wv", - "name": "einsum_default_501", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = 
self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_346", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wv", - "name": "permute_987", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1222", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_987", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention.wv", - "name": "einsum_default_502", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_501", - "src_placement": 
"P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wv", - "name": "permute_988", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_988", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wv", - "name": "dtype_cast_469", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_469", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention.wv", - "name": "alias_default_1532", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1276", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "alias_default_1223", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1223", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_343", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wk", - "name": "einsum_default_503", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_345", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wk", - "name": "permute_991", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1223", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_991", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wk", - "name": "einsum_default_504", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_502", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_504", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "add_298", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_503", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wk", - "name": "permute_992", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_992", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wk", - "name": "dtype_cast_470", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_470", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention.wk", - "name": "alias_default_1531", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1277", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention", - "name": "alias_default_1224", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1224", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_343", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wq", - "name": "einsum_default_505", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_344", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wq", - "name": "permute_995", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1224", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_995", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention.wq", - "name": "einsum_default_506", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_298", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_506", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12", - "name": "add_299", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_505", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wq", - "name": "permute_996", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_996", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention.wq", - "name": "dtype_cast_471", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_471", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention.wq", - "name": "alias_default_1530", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_299", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "convert_element_type_1676", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_339", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "convert_element_type_1677", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_340", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "convert_element_type_1678", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1676", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "alias_default_1225", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1225", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1678", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "mul_626", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1677", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_342", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "mul_627", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_626", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "alias_default_1226", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_627", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "alias_default_1227", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1227", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1226", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "mul_628", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_628", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "sum_121", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1227", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "div_72", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_72", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_121", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "mul_629", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1226", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_629", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "sub_60", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_60", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_342", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "mul_630", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1225", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1227", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "mul_631", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_631", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "sum_122", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_630", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "convert_element_type_1679", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_122", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "convert_element_type_1680", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1221", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1679", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "add_300", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1680", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "dtype_cast_472", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_472", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.12.attention_norm", - "name": "alias_default_1537", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_300", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w2", - "name": "alias_default_1228", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1228", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_337", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w2", - "name": "einsum_default_507", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_338", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.11.feed_forward.w2", - "name": "permute_999", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1228", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_999", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w2", - "name": "einsum_default_508", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_507", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w2", - "name": "permute_1000", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_1000", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w2", - "name": "dtype_cast_473", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_473", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w2", - "name": "alias_default_1526", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_508", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w2", - "name": "alias_default_1229", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1229", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_334", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "mul_632", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1229", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_336", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "mul_633", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_632", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "alias_default_1230", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1230", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_330", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w3", - "name": "einsum_default_509", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_335", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w3", - "name": "permute_1003", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1230", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1003", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w3", - "name": "einsum_default_510", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_509", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w3", - "name": "permute_1004", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1004", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.11.feed_forward.w3", - "name": "dtype_cast_474", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_474", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w3", - "name": "alias_default_1527", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_633", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "convert_element_type_1689", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - 
"compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_332", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "convert_element_type_1690", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1690", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "alias_default_1231", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1231", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "neg_52", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "exp_52", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "add_301", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_301", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "reciprocal_20", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_20", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "mul_634", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_634", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "alias_default_1232", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1689", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1232", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "mul_635", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1232", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "sub_61", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1231", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_61", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "mul_636", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_636", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "add_302", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_635", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_302", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "mul_637", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_637", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "convert_element_type_1691", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1691", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward", - "name": "alias_default_1233", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1233", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_330", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w1", - "name": "einsum_default_511", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_331", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w1", - "name": "permute_1007", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1233", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1007", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w1", - "name": "einsum_default_512", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_510", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_512", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11", - "name": "add_303", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_511", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w1", - "name": "permute_1008", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1008", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.feed_forward.w1", - "name": "dtype_cast_475", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_475", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], 
- "module_path": "L['self'].layers.11.feed_forward.w1", - "name": "alias_default_1525", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_303", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "convert_element_type_1696", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_326", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "convert_element_type_1697", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_327", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "convert_element_type_1698", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1696", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "alias_default_1234", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1234", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1698", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "mul_638", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1697", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_329", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "mul_639", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_638", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "alias_default_1235", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_639", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": 
"alias_default_1236", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1236", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1235", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "mul_640", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_640", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "sum_123", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1236", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "div_73", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_73", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_123", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "mul_641", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1235", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_641", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "sub_62", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_62", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_329", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "mul_642", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1234", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1236", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "mul_643", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_643", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "sum_124", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_642", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "convert_element_type_1699", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_124", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "convert_element_type_1700", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1228", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1699", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "add_304", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1700", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "dtype_cast_476", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_476", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.ffn_norm", - "name": "alias_default_1529", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_304", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wo", - "name": "alias_default_1237", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1237", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_324", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wo", - "name": "einsum_default_513", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_325", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wo", - "name": "permute_1011", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1237", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1011", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wo", - "name": "einsum_default_514", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_513", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wo", - "name": "permute_1012", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1012", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wo", - "name": "dtype_cast_477", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_477", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention.wo", - "name": "alias_default_1524", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_514", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_1292", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1292", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "permute_1013", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1013", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_320", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_321", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_322", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_323", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_100", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_105", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, 
- "dst_placement": "RR", - "name": "getitem_106", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_20", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.sdpa", - "name": "getitem_348", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.sdpa", - "name": "getitem_349", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_20", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.sdpa", - "name": "getitem_350", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_350", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "permute_1014", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_349", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "permute_1015", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_348", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "permute_1016", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1014", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_1293", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1293", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.11.attention", - "name": "sum_125", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_125", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "squeeze_40", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1015", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_1294", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1294", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "sum_126", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_126", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "squeeze_41", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_41", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "convert_element_type_1705", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1016", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "convert_element_type_1706", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1705", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_1295", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1295", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_as_complex_104", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_319", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "_conj_40", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_40", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "clone_230", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_104", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_230", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": 
"mul_644", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1706", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_1296", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1296", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_as_complex_105", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_319", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "_conj_41", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_41", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "clone_231", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_105", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_231", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "mul_645", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_644", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_as_real_104", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_104", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_1297", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1297", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "convert_element_type_1707", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_645", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_as_real_105", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_105", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_1298", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"view_1298", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "convert_element_type_1708", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_40", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_1299", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1707", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_1300", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1708", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "view_1301", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1299", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "alias_default_1238", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1238", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_315", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wv", - "name": "einsum_default_515", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = 
self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_318", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wv", - "name": "permute_1019", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1238", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1019", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention.wv", - "name": "einsum_default_516", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_515", - "src_placement": 
"P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wv", - "name": "permute_1020", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1020", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wv", - "name": "dtype_cast_478", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_478", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention.wv", - "name": "alias_default_1523", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, 
- "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1300", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "alias_default_1239", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1239", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_315", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wk", - "name": "einsum_default_517", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_317", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wk", - "name": "permute_1023", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1239", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1023", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wk", - "name": "einsum_default_518", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_516", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_518", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "add_305", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_517", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wk", - "name": "permute_1024", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1024", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wk", - "name": "dtype_cast_479", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_479", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention.wk", - "name": "alias_default_1522", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - 
}, - { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1301", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention", - "name": "alias_default_1240", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1240", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_315", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wq", - "name": "einsum_default_519", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_316", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wq", - "name": "permute_1027", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 
4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1240", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1027", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention.wq", - "name": "einsum_default_520", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_305", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_520", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11", - "name": "add_306", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - 
"cluster_root": "permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_519", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wq", - "name": "permute_1028", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1028", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention.wq", - "name": "dtype_cast_480", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_480", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention.wq", - "name": "alias_default_1521", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_306", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "convert_element_type_1721", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_311", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "convert_element_type_1722", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_312", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "convert_element_type_1723", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1721", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "alias_default_1241", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1241", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1723", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "mul_646", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1722", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_314", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "mul_647", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_646", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "alias_default_1242", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_647", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "alias_default_1243", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1243", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1242", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "mul_648", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_648", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "sum_127", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1243", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "div_74", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_74", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_127", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "mul_649", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1242", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_649", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "sub_63", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_63", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_314", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "mul_650", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1241", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1243", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "mul_651", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_651", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "sum_128", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_650", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "convert_element_type_1724", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_128", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "convert_element_type_1725", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1237", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1724", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "add_307", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1725", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "dtype_cast_481", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_481", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.11.attention_norm", - "name": "alias_default_1528", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_307", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w2", - "name": "alias_default_1244", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1244", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_309", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w2", - "name": "einsum_default_521", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_310", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.10.feed_forward.w2", - "name": "permute_1031", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1244", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_1031", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w2", - "name": "einsum_default_522", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_521", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w2", - "name": "permute_1032", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, 
- "cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_1032", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w2", - "name": "dtype_cast_482", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_482", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w2", - "name": "alias_default_1517", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_522", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w2", - "name": "alias_default_1245", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1245", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_306", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "mul_652", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1245", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_308", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "mul_653", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_652", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "alias_default_1246", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1246", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_302", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w3", - "name": "einsum_default_523", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_307", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w3", - "name": "permute_1035", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1246", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1035", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w3", - "name": "einsum_default_524", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_523", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w3", - "name": "permute_1036", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1036", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].layers.10.feed_forward.w3", - "name": "dtype_cast_483", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_483", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w3", - "name": "alias_default_1518", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_653", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "convert_element_type_1734", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - 
"compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_304", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "convert_element_type_1735", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1735", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "alias_default_1247", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1247", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "neg_53", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "exp_53", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "add_308", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_308", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "reciprocal_21", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], 
- "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_21", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "mul_654", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_654", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "alias_default_1248", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1734", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1248", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "mul_655", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1248", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "sub_64", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1247", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_64", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "mul_656", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_656", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "add_309", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_655", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_309", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "mul_657", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_657", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "convert_element_type_1736", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1736", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward", - "name": "alias_default_1249", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1249", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_302", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w1", - "name": "einsum_default_525", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": 
[ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_303", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w1", - "name": "permute_1039", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1249", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1039", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w1", - "name": "einsum_default_526", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_524", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_526", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10", - "name": "add_310", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_525", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w1", - "name": "permute_1040", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1040", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.feed_forward.w1", - "name": "dtype_cast_484", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_484", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], 
- "module_path": "L['self'].layers.10.feed_forward.w1", - "name": "alias_default_1516", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_310", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "convert_element_type_1741", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_298", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "convert_element_type_1742", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_299", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "convert_element_type_1743", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1741", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "alias_default_1250", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1250", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1743", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "mul_658", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1742", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_301", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "mul_659", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_658", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "alias_default_1251", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_659", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": 
"alias_default_1252", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1252", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1251", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "mul_660", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_660", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "sum_129", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1252", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "div_75", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_75", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_129", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "mul_661", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1251", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_661", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "sub_65", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_65", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_301", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "mul_662", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1250", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1252", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "mul_663", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_663", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "sum_130", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_662", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "convert_element_type_1744", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_130", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "convert_element_type_1745", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - 
"cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1244", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1744", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "add_311", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1745", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "dtype_cast_485", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_485", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.ffn_norm", - "name": "alias_default_1520", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 
- ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_311", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wo", - "name": "alias_default_1253", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1253", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_296", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wo", - "name": "einsum_default_527", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_297", - "src_placement": 
"RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wo", - "name": "permute_1043", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1253", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1043", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wo", - "name": "einsum_default_528", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_527", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wo", - "name": "permute_1044", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": 
"dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1044", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wo", - "name": "dtype_cast_486", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_486", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention.wo", - "name": "alias_default_1515", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_528", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_1316", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1316", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "permute_1045", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1045", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_292", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_293", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_294", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_295", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_91", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_96", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_97", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_21", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.sdpa", - "name": "getitem_351", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.sdpa", - "name": "getitem_352", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.sdpa", - "name": "getitem_353", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_353", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "permute_1046", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_352", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "permute_1047", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_351", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "permute_1048", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1046", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_1317", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1317", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.10.attention", - "name": "sum_131", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_131", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "squeeze_42", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1047", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_1318", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1318", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "sum_132", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_132", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "squeeze_43", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_43", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "convert_element_type_1750", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1048", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "convert_element_type_1751", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1750", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_1319", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1319", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_as_complex_106", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_291", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "_conj_42", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_42", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "clone_238", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_106", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_238", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": 
"mul_664", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1751", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_1320", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1320", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_as_complex_107", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_291", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "_conj_43", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_43", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "clone_239", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_107", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_239", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "mul_665", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": 
"apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_664", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_as_real_106", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_106", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_1321", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1321", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "convert_element_type_1752", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_665", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_as_real_107", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_107", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_1322", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": 
"view_1322", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "convert_element_type_1753", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_42", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_1323", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1752", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_1324", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1753", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "view_1325", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1323", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "alias_default_1254", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1254", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_287", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wv", - "name": "einsum_default_529", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = 
self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_290", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wv", - "name": "permute_1051", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1254", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1051", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention.wv", - "name": "einsum_default_530", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_529", - "src_placement": 
"P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wv", - "name": "permute_1052", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1052", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wv", - "name": "dtype_cast_487", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_487", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention.wv", - "name": "alias_default_1514", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, 
- "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1324", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "alias_default_1255", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1255", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_287", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wk", - "name": "einsum_default_531", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_289", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wk", - "name": "permute_1055", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1255", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1055", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wk", - "name": "einsum_default_532", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_530", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_532", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "add_312", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_531", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wk", - "name": "permute_1056", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1056", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wk", - "name": "dtype_cast_488", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_488", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention.wk", - "name": "alias_default_1513", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - 
}, - { - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1325", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention", - "name": "alias_default_1256", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1256", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_287", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wq", - "name": "einsum_default_533", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_288", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wq", - "name": "permute_1059", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 
4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1256", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1059", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention.wq", - "name": "einsum_default_534", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_312", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_534", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10", - "name": "add_313", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - 
"cluster_root": "permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_533", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wq", - "name": "permute_1060", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1060", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention.wq", - "name": "dtype_cast_489", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_489", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention.wq", - "name": "alias_default_1512", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_313", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "convert_element_type_1766", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_283", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "convert_element_type_1767", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_284", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "convert_element_type_1768", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1766", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "alias_default_1257", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1257", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1768", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "mul_666", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { 
- "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1767", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_286", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "mul_667", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_666", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "alias_default_1258", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_667", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "alias_default_1259", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1259", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1258", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "mul_668", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_668", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "sum_133", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1259", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "div_76", - 
"op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_76", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_133", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "mul_669", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1258", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_669", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "sub_66", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_66", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_286", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "mul_670", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1257", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1259", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "mul_671", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_671", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "sum_134", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_670", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "convert_element_type_1769", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_134", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "convert_element_type_1770", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1253", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1769", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "add_314", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1770", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "dtype_cast_490", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_490", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.10.attention_norm", - "name": "alias_default_1519", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_314", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w2", - "name": "alias_default_1260", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1260", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_281", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w2", - "name": "einsum_default_535", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_282", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.9.feed_forward.w2", - "name": "permute_1063", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1260", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_1063", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w2", - "name": "einsum_default_536", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_535", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w2", - "name": "permute_1064", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_1064", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w2", - "name": "dtype_cast_491", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_491", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w2", - "name": "alias_default_1508", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_536", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w2", - "name": "alias_default_1261", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1261", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_278", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "mul_672", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1261", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_280", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "mul_673", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_672", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "alias_default_1262", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1262", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_274", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w3", - "name": "einsum_default_537", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_279", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w3", - "name": "permute_1067", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1262", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1067", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w3", - "name": "einsum_default_538", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_537", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w3", - "name": "permute_1068", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1068", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.9.feed_forward.w3", - "name": "dtype_cast_492", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_492", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w3", - "name": "alias_default_1509", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_673", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "convert_element_type_1779", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_276", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "convert_element_type_1780", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1780", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "alias_default_1263", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1263", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "neg_54", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_54", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "exp_54", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_54", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "add_315", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_315", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "reciprocal_22", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_22", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "mul_674", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_674", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "alias_default_1264", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1779", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1264", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "mul_675", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1264", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "sub_67", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1263", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_67", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "mul_676", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_676", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "add_316", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_675", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_316", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "mul_677", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_677", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "convert_element_type_1781", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1781", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward", - "name": "alias_default_1265", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1265", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_274", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w1", - "name": "einsum_default_539", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_275", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w1", - "name": "permute_1071", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1265", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1071", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w1", - "name": "einsum_default_540", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_538", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_540", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9", - "name": "add_317", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_539", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w1", - "name": "permute_1072", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1072", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.feed_forward.w1", - "name": "dtype_cast_493", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_493", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.9.feed_forward.w1", - "name": "alias_default_1507", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_317", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "convert_element_type_1786", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_270", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "convert_element_type_1787", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_271", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "convert_element_type_1788", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1786", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "alias_default_1266", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1266", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1788", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "mul_678", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1787", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_273", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "mul_679", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_678", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "alias_default_1267", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_679", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": 
"alias_default_1268", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1268", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1267", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "mul_680", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_680", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "sum_135", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1268", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "div_77", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_77", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_135", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "mul_681", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1267", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_681", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "sub_68", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_68", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_273", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "mul_682", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1266", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1268", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "mul_683", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_683", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "sum_136", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_682", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "convert_element_type_1789", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_136", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "convert_element_type_1790", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": 
"add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1260", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1789", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "add_318", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1790", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "dtype_cast_494", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_494", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.ffn_norm", - "name": "alias_default_1511", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { 
- "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_318", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wo", - "name": "alias_default_1269", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1269", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_268", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wo", - "name": "einsum_default_541", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_269", - "src_placement": "RR", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wo", - "name": "permute_1075", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1269", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1075", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wo", - "name": "einsum_default_542", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_541", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wo", - "name": "permute_1076", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1076", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wo", - "name": "dtype_cast_495", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_495", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention.wo", - "name": "alias_default_1506", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_542", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_1340", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1340", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "permute_1077", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1077", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_264", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_265", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_266", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_267", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_82", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_87", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_88", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_22", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_22", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.sdpa", - "name": "getitem_354", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_22", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.sdpa", - "name": "getitem_355", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_22", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.sdpa", - "name": "getitem_356", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_356", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "permute_1078", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_355", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "permute_1079", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_354", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "permute_1080", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1078", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_1341", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1341", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.9.attention", - "name": "sum_137", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_137", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "squeeze_44", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1079", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_1342", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1342", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "sum_138", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_138", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "squeeze_45", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_45", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "convert_element_type_1795", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1080", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "convert_element_type_1796", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1795", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_1343", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1343", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_as_complex_108", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_263", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "_conj_44", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_44", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "clone_246", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_108", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_246", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": 
"mul_684", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1796", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_1344", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1344", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_as_complex_109", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_263", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "_conj_45", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_45", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "clone_247", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_109", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_247", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "mul_685", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", 
- "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_684", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_as_real_108", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_108", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_1345", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1345", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "convert_element_type_1797", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 
8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_685", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_as_real_109", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_109", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_1346", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1346", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "convert_element_type_1798", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_44", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_1347", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1797", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_1348", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1798", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "view_1349", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1347", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "alias_default_1270", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1270", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_259", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wv", - "name": "einsum_default_543", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_262", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wv", - "name": "permute_1083", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1270", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1083", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention.wv", - "name": "einsum_default_544", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_543", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.9.attention.wv", - "name": "permute_1084", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1084", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wv", - "name": "dtype_cast_496", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_496", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention.wv", - "name": "alias_default_1505", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1348", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "alias_default_1271", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1271", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_259", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wk", - "name": "einsum_default_545", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_261", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wk", - "name": "permute_1087", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1271", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1087", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wk", - "name": "einsum_default_546", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_544", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_546", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "add_319", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_545", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wk", - "name": "permute_1088", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1088", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wk", - "name": "dtype_cast_497", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_497", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention.wk", - "name": "alias_default_1504", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - 
{ - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1349", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention", - "name": "alias_default_1272", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1272", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_259", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wq", - "name": "einsum_default_547", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_260", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wq", - "name": "permute_1091", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1272", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1091", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention.wq", - "name": "einsum_default_548", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_319", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_548", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9", - "name": "add_320", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_547", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wq", - "name": "permute_1092", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1092", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention.wq", - "name": "dtype_cast_498", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_498", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention.wq", - "name": "alias_default_1503", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_320", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "convert_element_type_1811", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_255", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "convert_element_type_1812", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_256", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "convert_element_type_1813", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1811", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "alias_default_1273", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1273", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1813", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "mul_686", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1812", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_258", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "mul_687", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_686", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "alias_default_1274", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_687", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "alias_default_1275", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1275", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1274", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "mul_688", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_688", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "sum_139", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1275", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "div_78", - "op": 
"aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_78", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_139", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "mul_689", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1274", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_689", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "sub_69", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_69", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_258", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "mul_690", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1273", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1275", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "mul_691", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_691", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "sum_140", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_690", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "convert_element_type_1814", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_140", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "convert_element_type_1815", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1269", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1814", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "add_321", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1815", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "dtype_cast_499", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_499", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.9.attention_norm", - "name": "alias_default_1510", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_321", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w2", - "name": "alias_default_1276", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1276", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_253", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w2", - "name": "einsum_default_549", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_254", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.8.feed_forward.w2", - "name": "permute_1095", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1276", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_1095", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w2", - "name": "einsum_default_550", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_549", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w2", - "name": "permute_1096", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_1096", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w2", - "name": "dtype_cast_500", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_500", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w2", - "name": "alias_default_1499", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_550", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w2", - "name": "alias_default_1277", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1277", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_250", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "mul_692", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1277", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_252", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "mul_693", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_692", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "alias_default_1278", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1278", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_246", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w3", - "name": "einsum_default_551", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_251", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w3", - "name": "permute_1099", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1278", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1099", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w3", - "name": "einsum_default_552", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_551", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w3", - "name": "permute_1100", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1100", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.8.feed_forward.w3", - "name": "dtype_cast_501", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_501", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w3", - "name": "alias_default_1500", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_693", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "convert_element_type_1824", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_248", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "convert_element_type_1825", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1825", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "alias_default_1279", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1279", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "neg_55", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_55", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "exp_55", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_55", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "add_322", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_322", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "reciprocal_23", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_23", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "mul_694", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_694", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "alias_default_1280", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1824", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1280", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "mul_695", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1280", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "sub_70", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1279", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_70", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "mul_696", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_696", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "add_323", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_695", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_323", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "mul_697", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_697", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "convert_element_type_1826", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1826", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward", - "name": "alias_default_1281", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1281", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_246", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w1", - "name": "einsum_default_553", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_247", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w1", - "name": "permute_1103", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1281", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1103", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w1", - "name": "einsum_default_554", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_552", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_554", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8", - "name": "add_324", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_553", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w1", - "name": "permute_1104", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1104", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.feed_forward.w1", - "name": "dtype_cast_502", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_502", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.8.feed_forward.w1", - "name": "alias_default_1498", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_324", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "convert_element_type_1831", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_242", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "convert_element_type_1832", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_243", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "convert_element_type_1833", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1831", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "alias_default_1282", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1282", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1833", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "mul_698", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1832", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_245", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "mul_699", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_698", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "alias_default_1283", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_699", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": 
"alias_default_1284", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1284", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1283", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "mul_700", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_700", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "sum_141", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1284", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "div_79", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_79", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_141", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "mul_701", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1283", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_701", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "sub_71", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_71", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_245", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "mul_702", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1282", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1284", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "mul_703", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_703", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "sum_142", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_702", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "convert_element_type_1834", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_142", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "convert_element_type_1835", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": 
"add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1276", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1834", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "add_325", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1835", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "dtype_cast_503", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_503", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.ffn_norm", - "name": "alias_default_1502", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { 
- "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_325", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wo", - "name": "alias_default_1285", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1285", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_240", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wo", - "name": "einsum_default_555", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_241", - "src_placement": "RR", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wo", - "name": "permute_1107", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1285", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1107", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wo", - "name": "einsum_default_556", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_555", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wo", - "name": "permute_1108", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1108", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wo", - "name": "dtype_cast_504", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_504", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention.wo", - "name": "alias_default_1497", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_556", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_1364", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1364", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "permute_1109", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1109", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_236", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_237", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_238", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_239", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_73", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_78", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_79", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_23", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_23", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.sdpa", - "name": "getitem_357", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_23", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.sdpa", - "name": "getitem_358", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_23", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.sdpa", - "name": "getitem_359", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_359", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "permute_1110", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_358", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "permute_1111", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_357", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "permute_1112", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1110", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_1365", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1365", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.8.attention", - "name": "sum_143", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_143", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "squeeze_46", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1111", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_1366", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1366", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "sum_144", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_144", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "squeeze_47", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_47", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "convert_element_type_1840", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1112", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "convert_element_type_1841", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1840", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_1367", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1367", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_as_complex_110", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_235", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "_conj_46", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_46", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "clone_254", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_110", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_254", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": 
"mul_704", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1841", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_1368", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1368", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_as_complex_111", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_235", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "_conj_47", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_47", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "clone_255", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_111", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_255", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "mul_705", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", 
- "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_704", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_as_real_110", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_110", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_1369", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1369", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "convert_element_type_1842", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 
8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_705", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_as_real_111", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_111", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_1370", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1370", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "convert_element_type_1843", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_46", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_1371", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1842", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_1372", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1843", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "view_1373", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1371", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "alias_default_1286", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1286", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_231", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wv", - "name": "einsum_default_557", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_234", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wv", - "name": "permute_1115", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1286", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1115", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention.wv", - "name": "einsum_default_558", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_557", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.8.attention.wv", - "name": "permute_1116", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1116", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wv", - "name": "dtype_cast_505", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_505", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention.wv", - "name": "alias_default_1496", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1372", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "alias_default_1287", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1287", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_231", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wk", - "name": "einsum_default_559", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_233", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wk", - "name": "permute_1119", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1287", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1119", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wk", - "name": "einsum_default_560", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_558", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_560", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "add_326", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_559", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wk", - "name": "permute_1120", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1120", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wk", - "name": "dtype_cast_506", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_506", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention.wk", - "name": "alias_default_1495", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - 
{ - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1373", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention", - "name": "alias_default_1288", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1288", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_231", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wq", - "name": "einsum_default_561", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_232", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wq", - "name": "permute_1123", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1288", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1123", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention.wq", - "name": "einsum_default_562", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_326", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_562", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8", - "name": "add_327", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_561", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wq", - "name": "permute_1124", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1124", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention.wq", - "name": "dtype_cast_507", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_507", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention.wq", - "name": "alias_default_1494", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_327", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "convert_element_type_1856", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_227", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "convert_element_type_1857", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_228", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "convert_element_type_1858", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1856", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "alias_default_1289", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1289", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1858", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "mul_706", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1857", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_230", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "mul_707", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_706", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "alias_default_1290", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_707", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "alias_default_1291", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1291", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1290", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "mul_708", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_708", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "sum_145", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1291", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "div_80", - "op": 
"aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_80", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_145", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "mul_709", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1290", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_709", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "sub_72", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_72", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_230", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "mul_710", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1289", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1291", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "mul_711", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_711", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "sum_146", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_710", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "convert_element_type_1859", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_146", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "convert_element_type_1860", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1285", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1859", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "add_328", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1860", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "dtype_cast_508", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_508", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.8.attention_norm", - "name": "alias_default_1501", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_328", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w2", - "name": "alias_default_1292", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1292", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_225", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w2", - "name": "einsum_default_563", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_226", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.7.feed_forward.w2", - "name": "permute_1127", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1292", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_1127", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w2", - "name": "einsum_default_564", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_563", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w2", - "name": "permute_1128", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_1128", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w2", - "name": "dtype_cast_509", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_509", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w2", - "name": "alias_default_1490", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_564", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w2", - "name": "alias_default_1293", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1293", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_222", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "mul_712", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1293", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_224", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "mul_713", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_712", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "alias_default_1294", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1294", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_218", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w3", - "name": "einsum_default_565", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_223", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w3", - "name": "permute_1131", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1294", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1131", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w3", - "name": "einsum_default_566", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_565", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w3", - "name": "permute_1132", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1132", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.7.feed_forward.w3", - "name": "dtype_cast_510", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_510", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w3", - "name": "alias_default_1491", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_713", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "convert_element_type_1869", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_220", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "convert_element_type_1870", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1870", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "alias_default_1295", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1295", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "neg_56", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "exp_56", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "add_329", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_329", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "reciprocal_24", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "mul_714", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_714", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "alias_default_1296", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1869", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1296", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "mul_715", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1296", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "sub_73", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1295", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_73", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "mul_716", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_716", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "add_330", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_715", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_330", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "mul_717", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_717", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "convert_element_type_1871", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1871", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward", - "name": "alias_default_1297", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1297", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_218", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w1", - "name": "einsum_default_567", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_219", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w1", - "name": "permute_1135", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1297", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1135", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w1", - "name": "einsum_default_568", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_566", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_568", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7", - "name": "add_331", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_567", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w1", - "name": "permute_1136", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1136", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.feed_forward.w1", - "name": "dtype_cast_511", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_511", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.7.feed_forward.w1", - "name": "alias_default_1489", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_331", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "convert_element_type_1876", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_214", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "convert_element_type_1877", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_215", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "convert_element_type_1878", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1876", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "alias_default_1298", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1298", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1878", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "mul_718", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1877", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_217", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "mul_719", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_718", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "alias_default_1299", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_719", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": 
"alias_default_1300", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1300", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1299", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "mul_720", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_720", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "sum_147", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1300", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "div_81", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_81", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_147", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "mul_721", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1299", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_721", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "sub_74", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_74", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_217", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "mul_722", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1298", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1300", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "mul_723", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_723", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "sum_148", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_722", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "convert_element_type_1879", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_148", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "convert_element_type_1880", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": 
"add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1292", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1879", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "add_332", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1880", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "dtype_cast_512", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_512", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.ffn_norm", - "name": "alias_default_1493", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { 
- "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_332", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wo", - "name": "alias_default_1301", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1301", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_212", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wo", - "name": "einsum_default_569", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_213", - "src_placement": "RR", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wo", - "name": "permute_1139", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1301", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1139", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wo", - "name": "einsum_default_570", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_569", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wo", - "name": "permute_1140", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1140", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wo", - "name": "dtype_cast_513", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_513", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention.wo", - "name": "alias_default_1488", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_570", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_1388", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1388", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "permute_1141", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1141", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_208", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_209", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_210", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_211", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_64", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_69", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_70", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_24", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.sdpa", - "name": "getitem_360", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.sdpa", - "name": "getitem_361", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_24", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.sdpa", - "name": "getitem_362", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_362", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "permute_1142", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_361", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "permute_1143", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_360", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "permute_1144", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1142", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_1389", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1389", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.7.attention", - "name": "sum_149", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_149", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "squeeze_48", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1143", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_1390", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1390", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "sum_150", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_150", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "squeeze_49", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_49", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "convert_element_type_1885", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1144", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "convert_element_type_1886", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1885", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_1391", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1391", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_as_complex_112", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_207", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "_conj_48", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_48", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "clone_262", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_112", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_262", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": 
"mul_724", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1886", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_1392", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1392", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_as_complex_113", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_207", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "_conj_49", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_49", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "clone_263", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_113", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_263", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "mul_725", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", 
- "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_724", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_as_real_112", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_112", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_1393", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1393", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "convert_element_type_1887", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 
8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_725", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_as_real_113", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_113", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_1394", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1394", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "convert_element_type_1888", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_48", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_1395", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1887", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_1396", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1888", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "view_1397", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1395", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "alias_default_1302", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1302", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_203", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wv", - "name": "einsum_default_571", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_206", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wv", - "name": "permute_1147", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1302", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1147", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention.wv", - "name": "einsum_default_572", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_571", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.7.attention.wv", - "name": "permute_1148", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1148", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wv", - "name": "dtype_cast_514", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_514", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention.wv", - "name": "alias_default_1487", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1396", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "alias_default_1303", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1303", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_203", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wk", - "name": "einsum_default_573", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_205", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wk", - "name": "permute_1151", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1303", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1151", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wk", - "name": "einsum_default_574", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_572", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_574", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "add_333", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_573", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wk", - "name": "permute_1152", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1152", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wk", - "name": "dtype_cast_515", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_515", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention.wk", - "name": "alias_default_1486", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - 
{ - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1397", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention", - "name": "alias_default_1304", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1304", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_203", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wq", - "name": "einsum_default_575", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_204", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wq", - "name": "permute_1155", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1304", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1155", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention.wq", - "name": "einsum_default_576", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_333", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_576", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7", - "name": "add_334", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_575", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wq", - "name": "permute_1156", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1156", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention.wq", - "name": "dtype_cast_516", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_516", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention.wq", - "name": "alias_default_1485", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_334", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "convert_element_type_1901", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_199", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "convert_element_type_1902", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_200", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "convert_element_type_1903", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1901", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "alias_default_1305", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1305", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1903", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "mul_726", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1902", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_202", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "mul_727", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_726", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "alias_default_1306", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_727", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "alias_default_1307", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1307", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1306", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "mul_728", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_728", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "sum_151", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1307", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "div_82", - "op": 
"aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_82", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_151", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "mul_729", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1306", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_729", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "sub_75", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_75", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_202", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "mul_730", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1305", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1307", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "mul_731", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_731", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "sum_152", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_730", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "convert_element_type_1904", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_152", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "convert_element_type_1905", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1301", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1904", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "add_335", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1905", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "dtype_cast_517", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_517", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.7.attention_norm", - "name": "alias_default_1492", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_335", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w2", - "name": "alias_default_1308", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1308", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_197", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w2", - "name": "einsum_default_577", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_198", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.6.feed_forward.w2", - "name": "permute_1159", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1308", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_1159", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w2", - "name": "einsum_default_578", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_577", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w2", - "name": "permute_1160", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_1160", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w2", - "name": "dtype_cast_518", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_518", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w2", - "name": "alias_default_1481", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_578", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w2", - "name": "alias_default_1309", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1309", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_194", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "mul_732", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1309", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_196", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "mul_733", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_732", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "alias_default_1310", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1310", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_190", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w3", - "name": "einsum_default_579", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_195", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w3", - "name": "permute_1163", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1310", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1163", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w3", - "name": "einsum_default_580", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_579", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w3", - "name": "permute_1164", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1164", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.6.feed_forward.w3", - "name": "dtype_cast_519", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_519", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w3", - "name": "alias_default_1482", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_733", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "convert_element_type_1914", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_192", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "convert_element_type_1915", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1915", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "alias_default_1311", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1311", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "neg_57", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "exp_57", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "add_336", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_336", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "reciprocal_25", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_25", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "mul_734", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_734", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "alias_default_1312", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1914", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1312", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "mul_735", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1312", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "sub_76", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1311", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_76", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "mul_736", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_736", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "add_337", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_735", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_337", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "mul_737", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_737", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "convert_element_type_1916", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1916", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward", - "name": "alias_default_1313", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1313", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_190", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w1", - "name": "einsum_default_581", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_191", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w1", - "name": "permute_1167", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1313", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1167", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w1", - "name": "einsum_default_582", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_580", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_582", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6", - "name": "add_338", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_581", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w1", - "name": "permute_1168", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1168", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.feed_forward.w1", - "name": "dtype_cast_520", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_520", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.6.feed_forward.w1", - "name": "alias_default_1480", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_338", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "convert_element_type_1921", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_186", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "convert_element_type_1922", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_187", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "convert_element_type_1923", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1921", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "alias_default_1314", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1314", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1923", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "mul_738", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1922", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_189", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "mul_739", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_738", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "alias_default_1315", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_739", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": 
"alias_default_1316", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1316", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1315", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "mul_740", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_740", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "sum_153", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1316", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "div_83", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_83", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_153", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "mul_741", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1315", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_741", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "sub_77", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_77", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_189", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "mul_742", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1314", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1316", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "mul_743", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_743", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "sum_154", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_742", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "convert_element_type_1924", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_154", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "convert_element_type_1925", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": 
"add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1308", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1924", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "add_339", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1925", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "dtype_cast_521", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_521", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.ffn_norm", - "name": "alias_default_1484", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { 
- "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_339", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wo", - "name": "alias_default_1317", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1317", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_184", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wo", - "name": "einsum_default_583", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_185", - "src_placement": "RR", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wo", - "name": "permute_1171", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1317", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1171", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wo", - "name": "einsum_default_584", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_583", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wo", - "name": "permute_1172", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1172", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wo", - "name": "dtype_cast_522", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_522", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention.wo", - "name": "alias_default_1479", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_584", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_1412", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1412", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "permute_1173", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1173", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_180", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_181", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_182", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_183", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_55", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_60", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_61", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_25", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.sdpa", - "name": "getitem_363", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.sdpa", - "name": "getitem_364", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_25", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.sdpa", - "name": "getitem_365", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_365", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "permute_1174", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_364", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "permute_1175", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_363", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "permute_1176", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1174", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_1413", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1413", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.6.attention", - "name": "sum_155", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_155", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "squeeze_50", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1175", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_1414", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1414", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "sum_156", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_156", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "squeeze_51", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_51", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "convert_element_type_1930", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1176", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "convert_element_type_1931", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1930", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_1415", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1415", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_as_complex_114", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_179", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "_conj_50", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_50", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "clone_270", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_114", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_270", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": 
"mul_744", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1931", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_1416", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1416", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_as_complex_115", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_179", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "_conj_51", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_51", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "clone_271", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_115", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_271", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "mul_745", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", 
- "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_744", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_as_real_114", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_114", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_1417", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1417", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "convert_element_type_1932", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 
8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_745", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_as_real_115", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_115", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_1418", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1418", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "convert_element_type_1933", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_50", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_1419", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1932", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_1420", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1933", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "view_1421", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1419", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "alias_default_1318", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1318", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_175", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wv", - "name": "einsum_default_585", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_178", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wv", - "name": "permute_1179", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1318", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1179", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention.wv", - "name": "einsum_default_586", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_585", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.6.attention.wv", - "name": "permute_1180", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1180", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wv", - "name": "dtype_cast_523", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_523", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention.wv", - "name": "alias_default_1478", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1420", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "alias_default_1319", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1319", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_175", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wk", - "name": "einsum_default_587", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_177", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wk", - "name": "permute_1183", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1319", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1183", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wk", - "name": "einsum_default_588", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_586", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_588", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "add_340", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_587", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wk", - "name": "permute_1184", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1184", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wk", - "name": "dtype_cast_524", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_524", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention.wk", - "name": "alias_default_1477", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - 
{ - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1421", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention", - "name": "alias_default_1320", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1320", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_175", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wq", - "name": "einsum_default_589", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_176", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wq", - "name": "permute_1187", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1320", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1187", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention.wq", - "name": "einsum_default_590", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_340", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_590", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6", - "name": "add_341", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_589", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wq", - "name": "permute_1188", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1188", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention.wq", - "name": "dtype_cast_525", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_525", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention.wq", - "name": "alias_default_1476", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_341", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "convert_element_type_1946", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_171", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "convert_element_type_1947", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_172", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "convert_element_type_1948", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1946", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "alias_default_1321", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1321", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1948", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "mul_746", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1947", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_174", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "mul_747", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_746", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "alias_default_1322", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_747", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "alias_default_1323", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1323", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1322", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "mul_748", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_748", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "sum_157", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1323", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "div_84", - "op": 
"aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_84", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_157", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "mul_749", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1322", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_749", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "sub_78", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_78", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_174", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "mul_750", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1321", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1323", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "mul_751", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_751", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "sum_158", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_750", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "convert_element_type_1949", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_158", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "convert_element_type_1950", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1317", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1949", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "add_342", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1950", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "dtype_cast_526", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_526", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.6.attention_norm", - "name": "alias_default_1483", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_342", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w2", - "name": "alias_default_1324", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1324", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_169", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w2", - "name": "einsum_default_591", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_170", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.5.feed_forward.w2", - "name": "permute_1191", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1324", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_1191", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w2", - "name": "einsum_default_592", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_591", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w2", - "name": "permute_1192", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_1192", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w2", - "name": "dtype_cast_527", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_527", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w2", - "name": "alias_default_1472", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_592", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w2", - "name": "alias_default_1325", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1325", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_166", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "mul_752", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1325", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_168", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "mul_753", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_752", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "alias_default_1326", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1326", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_162", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w3", - "name": "einsum_default_593", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_167", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w3", - "name": "permute_1195", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1326", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1195", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w3", - "name": "einsum_default_594", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_593", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w3", - "name": "permute_1196", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1196", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.5.feed_forward.w3", - "name": "dtype_cast_528", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_528", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w3", - "name": "alias_default_1473", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_753", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "convert_element_type_1959", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_164", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "convert_element_type_1960", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1960", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "alias_default_1327", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1327", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "neg_58", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "exp_58", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "add_343", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_343", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "reciprocal_26", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "mul_754", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_754", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "alias_default_1328", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1959", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1328", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "mul_755", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1328", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "sub_79", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1327", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_79", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "mul_756", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_756", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "add_344", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_755", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_344", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "mul_757", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_757", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "convert_element_type_1961", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1961", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward", - "name": "alias_default_1329", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1329", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_162", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w1", - "name": "einsum_default_595", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_163", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w1", - "name": "permute_1199", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1329", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1199", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w1", - "name": "einsum_default_596", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_594", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_596", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5", - "name": "add_345", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_595", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w1", - "name": "permute_1200", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1200", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.feed_forward.w1", - "name": "dtype_cast_529", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_529", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.5.feed_forward.w1", - "name": "alias_default_1471", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_345", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "convert_element_type_1966", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_158", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "convert_element_type_1967", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_159", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "convert_element_type_1968", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1966", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "alias_default_1330", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1330", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1968", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "mul_758", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1967", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_161", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "mul_759", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_758", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "alias_default_1331", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_759", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": 
"alias_default_1332", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1332", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1331", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "mul_760", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_760", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "sum_159", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1332", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "div_85", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_85", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_159", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "mul_761", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1331", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_761", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "sub_80", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_80", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_161", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "mul_762", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1330", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1332", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "mul_763", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_763", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "sum_160", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_762", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "convert_element_type_1969", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_160", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "convert_element_type_1970", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": 
"add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1324", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1969", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "add_346", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1970", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "dtype_cast_530", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_530", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.ffn_norm", - "name": "alias_default_1475", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { 
- "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_346", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wo", - "name": "alias_default_1333", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1333", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_156", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wo", - "name": "einsum_default_597", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_157", - "src_placement": "RR", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wo", - "name": "permute_1203", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1333", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1203", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wo", - "name": "einsum_default_598", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_597", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wo", - "name": "permute_1204", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1204", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wo", - "name": "dtype_cast_531", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_531", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention.wo", - "name": "alias_default_1470", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_598", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_1436", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1436", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "permute_1205", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1205", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_152", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_153", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_154", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_155", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_46", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_51", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_52", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_26", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.sdpa", - "name": "getitem_366", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.sdpa", - "name": "getitem_367", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_26", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.sdpa", - "name": "getitem_368", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_368", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "permute_1206", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_367", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "permute_1207", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_366", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "permute_1208", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1206", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_1437", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1437", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.5.attention", - "name": "sum_161", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_161", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "squeeze_52", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1207", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_1438", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1438", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "sum_162", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_162", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "squeeze_53", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_53", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "convert_element_type_1975", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1208", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "convert_element_type_1976", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1975", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_1439", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1439", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_as_complex_116", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_151", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "_conj_52", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_52", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "clone_278", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_116", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_278", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": 
"mul_764", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1976", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_1440", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1440", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_as_complex_117", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_151", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "_conj_53", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_53", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "clone_279", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_117", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_279", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "mul_765", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", 
- "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_764", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_as_real_116", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_116", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_1441", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1441", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "convert_element_type_1977", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 
8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_765", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_as_real_117", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_117", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_1442", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1442", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "convert_element_type_1978", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_1443", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1977", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_1444", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_1978", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "view_1445", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1443", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "alias_default_1334", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1334", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_147", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wv", - "name": "einsum_default_599", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_150", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wv", - "name": "permute_1211", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1334", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1211", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention.wv", - "name": "einsum_default_600", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_599", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.5.attention.wv", - "name": "permute_1212", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1212", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wv", - "name": "dtype_cast_532", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_532", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention.wv", - "name": "alias_default_1469", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1444", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "alias_default_1335", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1335", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_147", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wk", - "name": "einsum_default_601", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_149", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wk", - "name": "permute_1215", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1335", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1215", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wk", - "name": "einsum_default_602", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_600", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_602", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "add_347", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_601", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wk", - "name": "permute_1216", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1216", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wk", - "name": "dtype_cast_533", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_533", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention.wk", - "name": "alias_default_1468", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - 
{ - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1445", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention", - "name": "alias_default_1336", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1336", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_147", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wq", - "name": "einsum_default_603", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_148", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wq", - "name": "permute_1219", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1336", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1219", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention.wq", - "name": "einsum_default_604", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_347", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_604", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5", - "name": "add_348", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_603", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wq", - "name": "permute_1220", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1220", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention.wq", - "name": "dtype_cast_534", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_534", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention.wq", - "name": "alias_default_1467", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_348", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "convert_element_type_1991", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_143", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "convert_element_type_1992", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_144", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "convert_element_type_1993", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1991", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "alias_default_1337", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1337", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_1993", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "mul_766", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1992", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_146", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "mul_767", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_766", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "alias_default_1338", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_767", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "alias_default_1339", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1339", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1338", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "mul_768", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_768", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "sum_163", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1339", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "div_86", - "op": 
"aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_86", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_163", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "mul_769", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1338", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_769", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "sub_81", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_81", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_146", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "mul_770", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1337", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1339", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "mul_771", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_771", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "sum_164", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_770", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "convert_element_type_1994", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_164", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "convert_element_type_1995", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1333", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_1994", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "add_349", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_1995", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "dtype_cast_535", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_535", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.5.attention_norm", - "name": "alias_default_1474", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_349", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w2", - "name": "alias_default_1340", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1340", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_141", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w2", - "name": "einsum_default_605", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_142", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.4.feed_forward.w2", - "name": "permute_1223", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1340", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_1223", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w2", - "name": "einsum_default_606", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_605", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w2", - "name": "permute_1224", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_1224", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w2", - "name": "dtype_cast_536", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_536", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w2", - "name": "alias_default_1463", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_606", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w2", - "name": "alias_default_1341", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1341", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_138", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "mul_772", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1341", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_140", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "mul_773", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_772", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "alias_default_1342", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1342", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_134", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w3", - "name": "einsum_default_607", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_139", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w3", - "name": "permute_1227", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1342", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1227", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w3", - "name": "einsum_default_608", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_607", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w3", - "name": "permute_1228", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1228", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.4.feed_forward.w3", - "name": "dtype_cast_537", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_537", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w3", - "name": "alias_default_1464", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_773", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "convert_element_type_2004", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_136", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "convert_element_type_2005", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2005", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "alias_default_1343", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1343", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "neg_59", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "exp_59", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "add_350", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_350", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "reciprocal_27", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_27", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "mul_774", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_774", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "alias_default_1344", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2004", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1344", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "mul_775", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1344", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "sub_82", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1343", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_82", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "mul_776", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_776", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "add_351", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_775", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_351", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "mul_777", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_777", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "convert_element_type_2006", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2006", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward", - "name": "alias_default_1345", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1345", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_134", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w1", - "name": "einsum_default_609", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_135", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w1", - "name": "permute_1231", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1345", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1231", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w1", - "name": "einsum_default_610", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_608", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_610", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4", - "name": "add_352", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_609", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w1", - "name": "permute_1232", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1232", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.feed_forward.w1", - "name": "dtype_cast_538", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_538", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.4.feed_forward.w1", - "name": "alias_default_1462", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_352", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "convert_element_type_2011", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_130", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "convert_element_type_2012", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_131", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "convert_element_type_2013", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2011", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "alias_default_1346", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1346", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_2013", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "mul_778", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2012", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_133", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "mul_779", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_778", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "alias_default_1347", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_779", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": 
"alias_default_1348", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1348", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1347", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "mul_780", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_780", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "sum_165", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1348", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "div_87", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_87", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_165", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "mul_781", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1347", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_781", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "sub_83", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_83", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_133", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "mul_782", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1346", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1348", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "mul_783", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_783", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "sum_166", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_782", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "convert_element_type_2014", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_166", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "convert_element_type_2015", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": 
"add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1340", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2014", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "add_353", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_2015", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "dtype_cast_539", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_539", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.ffn_norm", - "name": "alias_default_1466", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { 
- "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_353", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wo", - "name": "alias_default_1349", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1349", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_128", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wo", - "name": "einsum_default_611", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_129", - "src_placement": "RR", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wo", - "name": "permute_1235", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1349", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1235", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wo", - "name": "einsum_default_612", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_611", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wo", - "name": "permute_1236", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1236", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wo", - "name": "dtype_cast_540", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_540", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention.wo", - "name": "alias_default_1461", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_612", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_1460", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1460", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "permute_1237", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1237", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_124", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_125", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_126", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_127", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_37", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_42", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_43", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_27", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.sdpa", - "name": "getitem_369", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.sdpa", - "name": "getitem_370", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_27", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.sdpa", - "name": "getitem_371", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_371", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "permute_1238", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_370", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "permute_1239", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_369", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "permute_1240", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1238", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_1461", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1461", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.4.attention", - "name": "sum_167", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_167", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "squeeze_54", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1239", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_1462", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1462", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "sum_168", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_168", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "squeeze_55", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_55", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "convert_element_type_2020", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1240", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "convert_element_type_2021", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2020", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_1463", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1463", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_as_complex_118", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_123", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "_conj_54", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_54", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "clone_286", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_118", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_286", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": 
"mul_784", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2021", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_1464", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1464", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_as_complex_119", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_123", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "_conj_55", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_55", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "clone_287", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_119", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_287", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "mul_785", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", 
- "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_784", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_as_real_118", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_118", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_1465", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1465", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "convert_element_type_2022", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 
8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_785", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_as_real_119", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_119", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_1466", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1466", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "convert_element_type_2023", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_54", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_1467", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2022", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_1468", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2023", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "view_1469", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1467", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "alias_default_1350", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1350", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_119", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wv", - "name": "einsum_default_613", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_122", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wv", - "name": "permute_1243", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1350", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1243", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention.wv", - "name": "einsum_default_614", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_613", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.4.attention.wv", - "name": "permute_1244", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1244", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wv", - "name": "dtype_cast_541", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_541", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention.wv", - "name": "alias_default_1460", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1468", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "alias_default_1351", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1351", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_119", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wk", - "name": "einsum_default_615", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_121", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wk", - "name": "permute_1247", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1351", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1247", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wk", - "name": "einsum_default_616", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_614", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_616", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "add_354", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_615", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wk", - "name": "permute_1248", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1248", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wk", - "name": "dtype_cast_542", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_542", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention.wk", - "name": "alias_default_1459", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - 
{ - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1469", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention", - "name": "alias_default_1352", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1352", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_119", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wq", - "name": "einsum_default_617", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_120", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wq", - "name": "permute_1251", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 
4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1352", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1251", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention.wq", - "name": "einsum_default_618", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_354", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_618", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4", - "name": "add_355", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_617", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wq", - "name": "permute_1252", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1252", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention.wq", - "name": "dtype_cast_543", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_543", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention.wq", - "name": "alias_default_1458", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_355", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "convert_element_type_2036", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_115", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "convert_element_type_2037", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_116", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "convert_element_type_2038", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2036", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "alias_default_1353", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1353", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_2038", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "mul_786", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2037", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_118", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "mul_787", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_786", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "alias_default_1354", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_787", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "alias_default_1355", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1355", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1354", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "mul_788", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_788", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "sum_169", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1355", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "div_88", - "op": 
"aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_88", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_169", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "mul_789", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1354", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_789", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "sub_84", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_84", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_118", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "mul_790", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1353", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1355", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "mul_791", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_791", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "sum_170", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_790", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "convert_element_type_2039", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_170", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "convert_element_type_2040", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1349", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2039", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "add_356", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_2040", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "dtype_cast_544", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_544", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.4.attention_norm", - "name": "alias_default_1465", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_356", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w2", - "name": "alias_default_1356", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1356", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_113", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w2", - "name": "einsum_default_619", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_114", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.3.feed_forward.w2", - "name": "permute_1255", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1356", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_1255", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w2", - "name": "einsum_default_620", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_619", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w2", - "name": "permute_1256", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_1256", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w2", - "name": "dtype_cast_545", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_545", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w2", - "name": "alias_default_1454", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_620", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w2", - "name": "alias_default_1357", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1357", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_110", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "mul_792", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1357", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_112", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "mul_793", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_792", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "alias_default_1358", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1358", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_106", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w3", - "name": "einsum_default_621", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_111", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w3", - "name": "permute_1259", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1358", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1259", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w3", - "name": "einsum_default_622", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_621", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w3", - "name": "permute_1260", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1260", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.3.feed_forward.w3", - "name": "dtype_cast_546", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_546", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w3", - "name": "alias_default_1455", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_793", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "convert_element_type_2049", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_108", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "convert_element_type_2050", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2050", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "alias_default_1359", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1359", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "neg_60", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_60", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "exp_60", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_60", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "add_357", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_357", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "reciprocal_28", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "mul_794", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_794", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "alias_default_1360", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2049", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1360", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "mul_795", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1360", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "sub_85", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1359", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_85", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "mul_796", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_796", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "add_358", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_795", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_358", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "mul_797", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_797", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "convert_element_type_2051", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2051", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward", - "name": "alias_default_1361", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1361", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_106", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w1", - "name": "einsum_default_623", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ 
- { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_107", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w1", - "name": "permute_1263", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1361", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1263", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w1", - "name": "einsum_default_624", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_622", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_624", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3", - "name": "add_359", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_623", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w1", - "name": "permute_1264", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1264", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.feed_forward.w1", - "name": "dtype_cast_547", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_547", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.3.feed_forward.w1", - "name": "alias_default_1453", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_359", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "convert_element_type_2056", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_102", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "convert_element_type_2057", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_103", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "convert_element_type_2058", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2056", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "alias_default_1362", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1362", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_2058", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "mul_798", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, 
weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2057", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_105", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "mul_799", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_798", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "alias_default_1363", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_799", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": 
"alias_default_1364", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1364", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1363", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "mul_800", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_800", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "sum_171", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1364", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "div_89", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_89", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_171", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "mul_801", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1363", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_801", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "sub_86", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_86", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_105", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "mul_802", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1362", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1364", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "mul_803", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_803", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "sum_172", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_802", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "convert_element_type_2059", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_172", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "convert_element_type_2060", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": 
"add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1356", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2059", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "add_360", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_2060", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "dtype_cast_548", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_548", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.ffn_norm", - "name": "alias_default_1457", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { 
- "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_360", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wo", - "name": "alias_default_1365", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1365", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_100", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wo", - "name": "einsum_default_625", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_101", - "src_placement": "RR", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wo", - "name": "permute_1267", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1365", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1267", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wo", - "name": "einsum_default_626", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_625", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wo", - "name": "permute_1268", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - 
"compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1268", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wo", - "name": "dtype_cast_549", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_549", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention.wo", - "name": "alias_default_1452", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_626", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_1484", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1484", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "permute_1269", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1269", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_96", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_97", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_98", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_99", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_28", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_33", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_34", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_28", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_28", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.sdpa", - "name": "getitem_372", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_28", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.sdpa", - "name": "getitem_373", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_28", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.sdpa", - "name": "getitem_374", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_374", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "permute_1270", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_373", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "permute_1271", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_372", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "permute_1272", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1270", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_1485", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1485", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.3.attention", - "name": "sum_173", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_173", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "squeeze_56", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1271", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_1486", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1486", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "sum_174", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_174", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "squeeze_57", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "convert_element_type_2065", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1272", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "convert_element_type_2066", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2065", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_1487", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1487", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_as_complex_120", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_95", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "_conj_56", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_56", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "clone_294", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_120", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_294", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": 
"mul_804", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2066", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_1488", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1488", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_as_complex_121", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_95", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "_conj_57", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_57", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "clone_295", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_121", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_295", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "mul_805", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - 
"line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_804", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_as_real_120", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_120", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_1489", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1489", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "convert_element_type_2067", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 
8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_805", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_as_real_121", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_121", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_1490", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1490", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "convert_element_type_2068", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_1491", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2067", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_1492", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2068", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "view_1493", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1491", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "alias_default_1366", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1366", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_91", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wv", - "name": "einsum_default_627", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_94", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wv", - "name": "permute_1275", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1366", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1275", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention.wv", - "name": "einsum_default_628", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_627", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.3.attention.wv", - "name": "permute_1276", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1276", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wv", - "name": "dtype_cast_550", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_550", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention.wv", - "name": "alias_default_1451", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1492", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "alias_default_1367", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1367", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_91", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wk", - "name": "einsum_default_629", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_93", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wk", - "name": "permute_1279", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1367", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1279", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wk", - "name": "einsum_default_630", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_628", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_630", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "add_361", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_629", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wk", - "name": "permute_1280", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1280", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wk", - "name": "dtype_cast_551", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_551", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention.wk", - "name": "alias_default_1450", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - 
{ - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1493", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention", - "name": "alias_default_1368", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1368", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_91", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wq", - "name": "einsum_default_631", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_92", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wq", - "name": "permute_1283", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 4096 
- ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1368", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1283", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention.wq", - "name": "einsum_default_632", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_361", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_632", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3", - "name": "add_362", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_631", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wq", - "name": "permute_1284", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1284", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention.wq", - "name": "dtype_cast_552", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_552", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention.wq", - "name": "alias_default_1449", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_362", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "convert_element_type_2081", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_87", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "convert_element_type_2082", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_88", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "convert_element_type_2083", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2081", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "alias_default_1369", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1369", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_2083", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "mul_806", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2082", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_90", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "mul_807", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_806", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "alias_default_1370", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_807", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "alias_default_1371", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1371", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1370", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "mul_808", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_808", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "sum_175", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1371", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "div_90", - "op": 
"aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_90", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_175", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "mul_809", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1370", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_809", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "sub_87", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_87", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_90", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "mul_810", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1369", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1371", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "mul_811", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_811", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "sum_176", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_810", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "convert_element_type_2084", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_176", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "convert_element_type_2085", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1365", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2084", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "add_363", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_2085", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "dtype_cast_553", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_553", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.3.attention_norm", - "name": "alias_default_1456", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_363", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w2", - "name": "alias_default_1372", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1372", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_85", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w2", - "name": "einsum_default_633", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "cluster_root": "permute_1319", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_86", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.2.feed_forward.w2", - "name": "permute_1287", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "cluster_root": "einsum_default_648", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1372", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_1287", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w2", - "name": "einsum_default_634", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_633", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w2", - "name": "permute_1288", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - 
"cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_1288", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w2", - "name": "dtype_cast_554", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_554", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w2", - "name": "alias_default_1445", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "cluster_root": "alias_default_1389", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_634", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w2", - "name": "alias_default_1373", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 116, - "cluster_root": "mul_832", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1373", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_82", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "mul_812", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "cluster_root": "mul_833", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1373", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_84", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "mul_813", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "cluster_root": "alias_default_1390", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(2)", - "name": "mul_812", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "alias_default_1374", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1374", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_78", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w3", - "name": "einsum_default_635", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "cluster_root": "permute_1323", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_83", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w3", - "name": "permute_1291", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "cluster_root": "einsum_default_650", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1374", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1291", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w3", - "name": "einsum_default_636", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_635", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w3", - "name": "permute_1292", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1292", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.2.feed_forward.w3", - "name": "dtype_cast_555", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_555", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w3", - "name": "alias_default_1446", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "cluster_root": "convert_element_type_2139", - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_813", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "convert_element_type_2094", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "cluster_root": "convert_element_type_2140", - "compute_cost": 
136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_80", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "convert_element_type_2095", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "cluster_root": "alias_default_1391", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2095", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "alias_default_1375", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "cluster_root": "neg_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1375", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "neg_61", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "cluster_root": "exp_62", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_61", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "exp_61", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "cluster_root": "add_371", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_61", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "add_364", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "cluster_root": "reciprocal_30", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_364", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "reciprocal_29", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - 
"source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "cluster_root": "mul_834", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "mul_814", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "cluster_root": "alias_default_1392", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_814", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "alias_default_1376", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "cluster_root": "mul_835", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2094", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1376", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "mul_815", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "cluster_root": "sub_91", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1376", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "sub_88", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "cluster_root": "mul_836", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1375", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_88", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "mul_816", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 133, - "cluster_root": "add_372", - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_816", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "add_365", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "cluster_root": "mul_837", - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_815", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_365", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "mul_817", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "cluster_root": "convert_element_type_2141", - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_817", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "convert_element_type_2096", - "op": "prims.convert_element_type.default", - "phase": "backward", - 
"placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "cluster_root": "alias_default_1393", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2096", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward", - "name": "alias_default_1377", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1377", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_78", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w1", - "name": "einsum_default_637", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "cluster_root": "permute_1327", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - 
{ - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_79", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w1", - "name": "permute_1295", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "cluster_root": "einsum_default_652", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1377", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1295", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w1", - "name": "einsum_default_638", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_636", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_638", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2", - "name": "add_366", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": 
"S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_637", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w1", - "name": "permute_1296", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1296", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.feed_forward.w1", - "name": "dtype_cast_556", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_556", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - 
"module_path": "L['self'].layers.2.feed_forward.w1", - "name": "alias_default_1444", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "cluster_root": "convert_element_type_2146", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_366", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "convert_element_type_2101", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "cluster_root": "convert_element_type_2147", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_74", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "convert_element_type_2102", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "cluster_root": "convert_element_type_2148", - 
"compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_75", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "convert_element_type_2103", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "cluster_root": "alias_default_1394", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2101", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "alias_default_1378", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "cluster_root": "mul_838", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1378", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_2103", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "mul_818", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "cluster_root": "mul_839", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2102", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_77", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "mul_819", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "cluster_root": "alias_default_1395", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_818", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "alias_default_1379", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "cluster_root": "alias_default_1396", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_819", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": 
"alias_default_1380", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "cluster_root": "mul_840", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1380", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1379", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "mul_820", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "cluster_root": "sum_183", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_820", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "sum_177", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "cluster_root": "div_93", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1380", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "div_91", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "cluster_root": "mul_841", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_91", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_177", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "mul_821", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - "cluster_root": "sub_92", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1379", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_821", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "sub_89", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "cluster_root": "mul_842", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_89", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_77", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "mul_822", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1378", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1380", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "mul_823", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "mul_823", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "sum_178", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "cluster_root": "convert_element_type_2149", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_822", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "convert_element_type_2104", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_178", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "convert_element_type_2105", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": 
"add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1372", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2104", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "add_367", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_2105", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "dtype_cast_557", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_557", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.ffn_norm", - "name": "alias_default_1448", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { 
- "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_367", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wo", - "name": "alias_default_1381", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1381", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_72", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wo", - "name": "einsum_default_639", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "cluster_root": "permute_1331", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_73", - "src_placement": "RR", - "transition_cost": 
0 - } - ], - "module_path": "L['self'].layers.2.attention.wo", - "name": "permute_1299", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "cluster_root": "einsum_default_654", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1381", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1299", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wo", - "name": "einsum_default_640", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_639", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wo", - "name": "permute_1300", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1300", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wo", - "name": "dtype_cast_558", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_558", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention.wo", - "name": "alias_default_1443", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "cluster_root": "view_1532", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_640", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_1508", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "cluster_root": "permute_1333", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1508", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "permute_1301", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "cluster_root": "_scaled_dot_product_flash_attention_backward_30", - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1301", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_68", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_69", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_70", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_71", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_19", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_24", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - 
"dst_placement": "RR", - "name": "getitem_25", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_29", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "cluster_root": "getitem_378", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_29", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.sdpa", - "name": "getitem_375", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "cluster_root": "getitem_379", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_29", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.sdpa", - "name": "getitem_376", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "cluster_root": "getitem_380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_29", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.sdpa", - "name": "getitem_377", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "cluster_root": "permute_1334", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_377", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "permute_1302", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "cluster_root": "permute_1335", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_376", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "permute_1303", - "op": "aten.permute.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "cluster_root": "permute_1336", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_375", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "permute_1304", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "cluster_root": "view_1533", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1302", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_1509", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "cluster_root": "sum_185", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1509", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.2.attention", - "name": "sum_179", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "cluster_root": "squeeze_60", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_179", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "squeeze_58", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "cluster_root": "view_1534", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1303", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_1510", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "cluster_root": "sum_186", - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1510", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "sum_180", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "cluster_root": "squeeze_61", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_180", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "squeeze_59", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "cluster_root": "convert_element_type_2155", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_59", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "convert_element_type_2110", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "cluster_root": "convert_element_type_2156", - "compute_cost": 
39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1304", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "convert_element_type_2111", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "cluster_root": "view_1535", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2110", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_1511", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "cluster_root": "view_as_complex_124", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1511", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_as_complex_122", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "cluster_root": "_conj_60", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_67", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "_conj_58", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "cluster_root": "clone_310", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_58", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "clone_302", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "cluster_root": "mul_844", - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_122", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_302", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": 
"mul_824", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "cluster_root": "view_1536", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2111", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_1512", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "cluster_root": "view_as_complex_125", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1512", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_as_complex_123", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "cluster_root": "_conj_61", - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - 
"name": "alias_default_67", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "_conj_59", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 184, - "cluster_root": "clone_311", - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_59", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "clone_303", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "cluster_root": "mul_845", - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_123", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_303", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "mul_825", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - 
"line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "cluster_root": "view_as_real_124", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_824", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_as_real_122", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "cluster_root": "view_1537", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_122", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_1513", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "cluster_root": "convert_element_type_2157", - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1513", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "convert_element_type_2112", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 
8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "cluster_root": "view_as_real_125", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_825", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_as_real_123", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "cluster_root": "view_1538", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_123", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_1514", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "cluster_root": "convert_element_type_2158", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1514", - "src_placement": 
"S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "convert_element_type_2113", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "cluster_root": "view_1539", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_58", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_1515", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "cluster_root": "view_1540", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2112", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_1516", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "cluster_root": "view_1541", - "compute_cost": 0.0, - "dtype": 
"bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2113", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "view_1517", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "cluster_root": "alias_default_1398", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1515", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "alias_default_1382", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1382", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_63", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wv", - "name": "einsum_default_641", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "cluster_root": "permute_1339", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_66", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wv", - "name": "permute_1307", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "cluster_root": "einsum_default_656", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1382", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1307", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention.wv", - "name": "einsum_default_642", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_641", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.2.attention.wv", - "name": "permute_1308", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1308", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wv", - "name": "dtype_cast_559", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_559", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention.wv", - "name": "alias_default_1442", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "cluster_root": "alias_default_1399", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1516", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "alias_default_1383", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1383", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_63", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wk", - "name": "einsum_default_643", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "cluster_root": "permute_1343", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_65", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wk", - "name": "permute_1311", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "cluster_root": "einsum_default_658", - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1383", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1311", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wk", - "name": "einsum_default_644", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_642", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_644", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "add_368", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_643", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wk", - "name": "permute_1312", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1312", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wk", - "name": "dtype_cast_560", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_560", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention.wk", - "name": "alias_default_1441", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - 
{ - "cluster_id": 202, - "cluster_root": "alias_default_1400", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1517", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention", - "name": "alias_default_1384", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1384", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_63", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wq", - "name": "einsum_default_645", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "cluster_root": "permute_1347", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_64", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wq", - "name": "permute_1315", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 4096 
- ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "cluster_root": "einsum_default_660", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1384", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1315", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention.wq", - "name": "einsum_default_646", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_368", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_646", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2", - "name": "add_369", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": 
"permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_645", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wq", - "name": "permute_1316", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1316", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention.wq", - "name": "dtype_cast_561", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_561", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention.wq", - "name": "alias_default_1440", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "cluster_root": "convert_element_type_2171", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_369", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "convert_element_type_2126", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "cluster_root": "convert_element_type_2172", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_59", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "convert_element_type_2127", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "cluster_root": "convert_element_type_2173", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_60", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "convert_element_type_2128", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "cluster_root": "alias_default_1401", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2126", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "alias_default_1385", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "cluster_root": "mul_846", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1385", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_2128", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "mul_826", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "cluster_root": "mul_847", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2127", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_62", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "mul_827", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "cluster_root": "alias_default_1402", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_826", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "alias_default_1386", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "cluster_root": "alias_default_1403", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_827", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "alias_default_1387", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": 
"/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "cluster_root": "mul_848", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1387", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1386", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "mul_828", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "cluster_root": "sum_187", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_828", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "sum_181", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "cluster_root": "div_94", - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1387", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "div_92", - "op": 
"aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "cluster_root": "mul_849", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_92", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_181", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "mul_829", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "cluster_root": "sub_93", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1386", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_829", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "sub_90", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - 
"cluster_root": "mul_850", - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_90", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_62", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "mul_830", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1385", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1387", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "mul_831", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_831", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "sum_182", - "op": 
"aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "cluster_root": "convert_element_type_2174", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_830", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "convert_element_type_2129", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_182", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "convert_element_type_2130", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1381", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2129", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "add_370", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_2130", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "dtype_cast_562", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_562", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.2.attention_norm", - "name": "alias_default_1447", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_370", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": "alias_default_1388", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "cluster_root": "einsum_default_661", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1388", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_57", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": "einsum_default_647", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 113, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_58", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": 
"permute_1319", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 114, - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1388", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_1319", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": "einsum_default_648", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "cluster_root": "permute_1352", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_647", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": "permute_1320", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - "cluster_root": "dtype_cast_572", - "compute_cost": 34.16146805037313, - "dtype": 
"float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "permute_1320", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": "dtype_cast_563", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "cluster_root": "alias_default_1427", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_563", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": "alias_default_1436", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 115, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_648", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w2", - "name": "alias_default_1389", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 116, - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1389", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_54", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "mul_832", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 117, - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1389", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_56", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "mul_833", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 118, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_832", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "alias_default_1390", - "op": "aten.alias.default", - 
"phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "cluster_root": "einsum_default_663", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1390", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_50", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w3", - "name": "einsum_default_649", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 119, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_55", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w3", - "name": "permute_1323", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 120, - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - 
"name": "alias_default_1390", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1323", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w3", - "name": "einsum_default_650", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "cluster_root": "permute_1356", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_649", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w3", - "name": "permute_1324", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "cluster_root": "dtype_cast_573", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1324", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w3", - "name": "dtype_cast_564", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "cluster_root": "alias_default_1428", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_564", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w3", - "name": "alias_default_1437", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 121, - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_833", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "convert_element_type_2139", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 122, - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_52", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "convert_element_type_2140", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - 
"shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 123, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2140", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "alias_default_1391", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 124, - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1391", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "neg_62", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 125, - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_62", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "exp_62", - "op": "aten.exp.default", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 126, - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_62", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "add_371", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 127, - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_371", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "reciprocal_30", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 128, - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_30", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "mul_834", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 129, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_834", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "alias_default_1392", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 130, - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2139", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1392", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "mul_835", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 131, - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1392", - "src_placement": "S(0)S(2)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "sub_91", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 132, - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1391", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_91", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "mul_836", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 133, - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_836", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "add_372", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 134, - "compute_cost": 273.29174440298505, - "dtype": "float32", 
- "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_835", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_372", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "mul_837", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 135, - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_837", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "convert_element_type_2141", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 136, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2141", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward", - "name": "alias_default_1393", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "cluster_root": "einsum_default_665", - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1393", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_50", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w1", - "name": "einsum_default_651", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 137, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_51", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w1", - "name": "permute_1327", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 138, - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1393", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1327", - 
"src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w1", - "name": "einsum_default_652", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 139, - "cluster_root": "add_163", - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_650", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_652", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1", - "name": "add_373", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "cluster_root": "permute_1360", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_651", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w1", - "name": "permute_1328", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - 
"line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "cluster_root": "dtype_cast_574", - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1328", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w1", - "name": "dtype_cast_565", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "cluster_root": "alias_default_1426", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_565", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.feed_forward.w1", - "name": "alias_default_1435", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 140, - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_373", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "convert_element_type_2146", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": 
"return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 141, - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_46", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "convert_element_type_2147", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 142, - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_47", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "convert_element_type_2148", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 143, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2146", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "alias_default_1394", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": 
"return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 144, - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1394", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_2148", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "mul_838", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 145, - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2147", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_49", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "mul_839", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 146, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_838", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "alias_default_1395", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 147, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_839", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "alias_default_1396", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 148, - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1396", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1395", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "mul_840", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 149, - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - 
"comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_840", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "sum_183", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 150, - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1396", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "div_93", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 151, - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_93", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_183", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "mul_841", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 152, - 
"compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1395", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_841", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "sub_92", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 153, - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_92", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_49", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "mul_842", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "cluster_root": "mul_863", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1394", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1396", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.1.ffn_norm", - "name": "mul_843", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "cluster_root": "sum_190", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_843", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "sum_184", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 154, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_842", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "convert_element_type_2149", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "cluster_root": "convert_element_type_2195", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_184", - "src_placement": "P(sum)P(sum)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "convert_element_type_2150", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 155, - "cluster_root": "add_164", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1388", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2149", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "add_374", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "cluster_root": "dtype_cast_575", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_2150", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "dtype_cast_566", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - 
"transition_cost": 0.0 - }, - { - "cluster_id": 235, - "cluster_root": "alias_default_1430", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_566", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.ffn_norm", - "name": "alias_default_1439", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 156, - "cluster_root": "alias_default_917", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_374", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wo", - "name": "alias_default_1397", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "cluster_root": "einsum_default_667", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1397", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_44", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wo", - "name": "einsum_default_653", - "op": "aten.einsum.default", - "phase": "backward", - 
"placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 157, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_45", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wo", - "name": "permute_1331", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 158, - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1397", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1331", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wo", - "name": "einsum_default_654", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "cluster_root": "permute_1364", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_653", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.1.attention.wo", - "name": "permute_1332", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "cluster_root": "dtype_cast_576", - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1332", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wo", - "name": "dtype_cast_567", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "cluster_root": "alias_default_1425", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_567", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention.wo", - "name": "alias_default_1434", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 159, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": 
"S(0)S(2)", - "name": "einsum_default_654", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_1532", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 160, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1532", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "permute_1333", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 161, - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1333", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_40", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_41", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_42", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_43", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 
0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_10", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_15", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_16", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_30", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 162, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_30", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.sdpa", - "name": "getitem_378", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 163, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_30", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.sdpa", - "name": "getitem_379", - "op": "", - "phase": "backward", - 
"placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 164, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_30", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.sdpa", - "name": "getitem_380", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 165, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_380", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "permute_1334", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 166, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_379", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "permute_1335", - 
"op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 167, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_378", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "permute_1336", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 168, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1334", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_1533", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 169, - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1533", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.1.attention", - "name": "sum_185", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 170, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_185", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "squeeze_60", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 171, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1335", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_1534", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 172, - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1534", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - 
"name": "sum_186", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 173, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_186", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "squeeze_61", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 174, - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_61", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "convert_element_type_2155", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 175, - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1336", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.1.attention", - "name": "convert_element_type_2156", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 176, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2155", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_1535", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 177, - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1535", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_as_complex_124", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 178, - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_39", - 
"src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "_conj_60", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 179, - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_60", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "clone_310", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 180, - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_124", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_310", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "mul_844", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 181, - "compute_cost": 
0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2156", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_1536", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 182, - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1536", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_as_complex_125", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 183, - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_39", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "_conj_61", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { 
- "cluster_id": 184, - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_61", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "clone_311", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 185, - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_125", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_311", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "mul_845", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 186, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_844", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_as_real_124", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 187, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_124", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_1537", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 188, - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1537", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "convert_element_type_2157", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 189, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_845", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_as_real_125", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 
8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 190, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_125", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_1538", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 191, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1538", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "convert_element_type_2158", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 192, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_60", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - 
"name": "view_1539", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 193, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2157", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_1540", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 194, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2158", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "view_1541", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 195, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1539", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "alias_default_1398", - 
"op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "cluster_root": "einsum_default_669", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1398", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_35", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wv", - "name": "einsum_default_655", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 196, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_38", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wv", - "name": "permute_1339", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 197, - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - 
"dst_placement": "S(0)S(1)", - "name": "alias_default_1398", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1339", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention.wv", - "name": "einsum_default_656", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - "cluster_root": "permute_1372", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_655", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wv", - "name": "permute_1340", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "cluster_root": "dtype_cast_577", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1340", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wv", - "name": "dtype_cast_568", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "cluster_root": "alias_default_1424", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_568", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention.wv", - "name": "alias_default_1433", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 198, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1540", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "alias_default_1399", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "cluster_root": "einsum_default_671", - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1399", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_35", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wk", - 
"name": "einsum_default_657", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 199, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_37", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wk", - "name": "permute_1343", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 200, - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1399", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1343", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wk", - "name": "einsum_default_658", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 201, - "cluster_root": "add_165", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 
0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_656", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_658", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "add_375", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "cluster_root": "permute_1376", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_657", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wk", - "name": "permute_1344", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "cluster_root": "dtype_cast_578", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1344", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wk", - "name": "dtype_cast_569", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "cluster_root": "alias_default_1423", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_569", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention.wk", - "name": "alias_default_1432", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 202, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1541", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention", - "name": "alias_default_1400", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "cluster_root": "einsum_default_673", - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1400", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_35", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wq", 
- "name": "einsum_default_659", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 203, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_36", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wq", - "name": "permute_1347", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 204, - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1400", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1347", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention.wq", - "name": "einsum_default_660", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 205, - "cluster_root": "add_166", - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - 
"inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_375", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_660", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1", - "name": "add_376", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "cluster_root": "permute_1380", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_659", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wq", - "name": "permute_1348", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "cluster_root": "dtype_cast_579", - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1348", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention.wq", - "name": "dtype_cast_570", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "cluster_root": "alias_default_1422", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_570", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention.wq", - "name": "alias_default_1431", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 206, - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_376", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "convert_element_type_2171", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 207, - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "convert_element_type_2172", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 
8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 208, - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_32", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "convert_element_type_2173", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 209, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2171", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "alias_default_1401", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 210, - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1401", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_2173", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", 
- "name": "mul_846", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 211, - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2172", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_34", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "mul_847", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 212, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_846", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "alias_default_1402", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 213, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_847", - "src_placement": 
"S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "alias_default_1403", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 214, - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1403", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1402", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "mul_848", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 215, - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_848", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "sum_187", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 216, - "compute_cost": 52.05557036247335, - 
"dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1403", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "div_94", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 217, - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_94", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_187", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "mul_849", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 218, - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1402", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_849", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "sub_93", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 219, - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_93", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_34", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "mul_850", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "cluster_root": "mul_871", - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1401", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1403", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "mul_851", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "cluster_root": "sum_194", - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", 
- "name": "mul_851", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "sum_188", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 220, - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_850", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "convert_element_type_2174", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "cluster_root": "convert_element_type_2220", - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_188", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "convert_element_type_2175", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 221, - "cluster_root": "add_167", - "compute_cost": 39.041677771855014, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1397", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2174", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "add_377", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "cluster_root": "dtype_cast_580", - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_2175", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "dtype_cast_571", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "cluster_root": "alias_default_1429", - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_571", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.1.attention_norm", - "name": "alias_default_1438", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return 
torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 222, - "cluster_root": "alias_default_924", - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)R", - "name": "add_377", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": "alias_default_1404", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)R", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 109, - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1404", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_29", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": "einsum_default_661", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "alias_default_30", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": 
"permute_1351", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_1404", - "src_placement": "S(0)R", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "permute_1351", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": "einsum_default_662", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 110, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "einsum_default_661", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": "permute_1352", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 111, - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": 
"permute_1352", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": "dtype_cast_572", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 112, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(1)", - "name": "dtype_cast_572", - "src_placement": "P(sum)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": "alias_default_1427", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_662", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w2", - "name": "alias_default_1405", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - 
"name": "alias_default_1405", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_26", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "mul_852", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1405", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_28", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "mul_853", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_852", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "alias_default_1406", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 223, - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1406", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_22", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w3", - "name": "einsum_default_663", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_27", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w3", - "name": "permute_1355", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1406", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1355", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.0.feed_forward.w3", - "name": "einsum_default_664", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 224, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_663", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w3", - "name": "permute_1356", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 225, - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1356", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w3", - "name": "dtype_cast_573", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 226, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_573", - "src_placement": "P(sum)S(0)", - 
"transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w3", - "name": "alias_default_1428", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_853", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "convert_element_type_2184", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 136.64587220149252, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_24", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "convert_element_type_2185", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2185", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "alias_default_1407", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1407", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "neg_63", - "op": "aten.neg.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "neg_63", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "exp_63", - "op": "aten.exp.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "exp_63", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.0.feed_forward", - "name": "add_378", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_378", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "reciprocal_31", - "op": "aten.reciprocal.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "reciprocal_31", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "mul_854", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_854", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": 
"alias_default_1408", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2184", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1408", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "mul_855", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1408", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "sub_94", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1407", - 
"src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sub_94", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "mul_856", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 182.1944962686567, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_856", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "add_379", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 273.29174440298505, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_855", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "add_379", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "mul_857", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, 
- "transition_cost": 0.0 - }, - { - "compute_cost": 136.64587220149252, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_857", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "convert_element_type_2186", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2186", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward", - "name": "alias_default_1409", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 14336 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 227, - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1409", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_22", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w1", - "name": "einsum_default_665", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 14336 - ], - "source": { - "code": "return 
self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_23", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w1", - "name": "permute_1359", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 694.8379851971689, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1409", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RS(0)", - "name": "permute_1359", - "src_placement": "RS(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w1", - "name": "einsum_default_666", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 156.16671108742005, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_664", - "src_placement": "S(0)P(sum)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)P(sum)", - "name": "einsum_default_666", - "src_placement": "S(0)P(sum)", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0", - "name": "add_380", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)P(sum)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 228, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_665", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w1", - "name": "permute_1360", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 355 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 229, - "compute_cost": 34.16146805037313, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1360", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w1", - "name": "dtype_cast_574", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 230, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 487.952, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_574", - "src_placement": 
"P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.feed_forward.w1", - "name": "alias_default_1426", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 430.3685785129651, - "dst_placement": "S(0)S(1)", - "name": "add_380", - "src_placement": "S(0)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "convert_element_type_2191", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_18", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "convert_element_type_2192", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_19", - "src_placement": "RR", - 
"transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "convert_element_type_2193", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2191", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "alias_default_1410", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1410", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_2193", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "mul_858", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", 
- "name": "convert_element_type_2192", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "mul_859", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_858", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "alias_default_1411", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_859", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "alias_default_1412", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - 
"dst_placement": "S(0)S(1)", - "name": "alias_default_1412", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1411", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "mul_860", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_860", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "sum_189", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1412", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "div_95", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - 
"inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_95", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_189", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "mul_861", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1411", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_861", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "sub_95", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_95", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_21", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "mul_862", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - 
], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 231, - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1410", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1412", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "mul_863", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 232, - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_863", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "sum_190", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_862", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "convert_element_type_2194", - "op": 
"prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 233, - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_190", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "convert_element_type_2195", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1404", - "src_placement": "S(0)R", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2194", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "add_381", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 1.0 - }, - { - "cluster_id": 234, - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_2195", - "src_placement": 
"P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "dtype_cast_575", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 235, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_575", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.ffn_norm", - "name": "alias_default_1430", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_381", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wo", - "name": "alias_default_1413", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 236, - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1413", - "src_placement": "S(0)S(1)", 
- "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_16", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wo", - "name": "einsum_default_667", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_17", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wo", - "name": "permute_1363", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1413", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1363", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wo", - "name": "einsum_default_668", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 237, - "compute_cost": 0.0, - 
"dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "einsum_default_667", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wo", - "name": "permute_1364", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return self.wo(output)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 316 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 238, - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "permute_1364", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wo", - "name": "dtype_cast_576", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 239, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 331.9007188940092, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_576", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention.wo", - "name": "alias_default_1425", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - "dst_placement": "S(0)S(2)", - "name": "einsum_default_668", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_1556", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "output = output.view(bs, seqlen, -1)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 315 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1556", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "permute_1365", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "output = output.transpose(", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 312 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 1985.2513862776257, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "permute_1365", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_12", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_13", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_14", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_15", - 
"src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_1", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_6", - "src_placement": "RR", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "getitem_7", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.sdpa", - "name": "_scaled_dot_product_flash_attention_backward_31", - "op": "aten._scaled_dot_product_flash_attention_backward.default", - "phase": "backward", - "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.sdpa", - "name": "getitem_381", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.sdpa", - "name": "getitem_382", - "op": 
"", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "_scaled_dot_product_flash_attention_backward_31", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.sdpa", - "name": "getitem_383", - "op": "", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 32, - 8192, - 128 - ], - "source": { - "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 53 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_383", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "permute_1366", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 308 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_382", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "permute_1367", - "op": "aten.permute.default", - 
"phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 307 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "getitem_381", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "permute_1368", - "op": "aten.permute.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 306 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1366", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_1557", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1557", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "sum_191", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": 
"S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_191", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "squeeze_62", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1367", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_1558", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 4, - 128 - ], - "source": { - "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 223 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 16.26736573827292, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1558", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "sum_192", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 1, - 128 - ], - "source": { - "code": ".expand(bs, 
slen, n_kv_heads, n_rep, head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 222 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "sum_192", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "squeeze_63", - "op": "aten.squeeze.dim", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "torch.unsqueeze(x, dim=3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "repeat_kv", - "line": 221 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_63", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "convert_element_type_2200", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "permute_1368", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "convert_element_type_2201", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "return xq_out.type_as(xq), 
xk_out.type_as(xk)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 212 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2200", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_1559", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1559", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_as_complex_126", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_11", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "_conj_62", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - 
"file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_62", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "clone_318", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 14.64062916444563, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_126", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_318", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "mul_864", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64 - ], - "source": { - "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 211 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2201", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_1560", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 
2 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1560", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_as_complex_127", - "op": "aten.view_as_complex.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_11", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "_conj_63", - "op": "aten._conj.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 7.0, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "_conj_63", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "clone_319", - "op": "aten.clone.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1, - 8192, - 1, - 64 - ], - "source": { - "code": "xq_out = 
torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 53.68230693630064, - "dtype": "complex64", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_complex_127", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "clone_319", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "mul_865", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64 - ], - "source": { - "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 210 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_864", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_as_real_126", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 64, - 2 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_126", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_1561", - "op": 
"aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 9.760419442963753, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1561", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "convert_element_type_2202", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 8, - 128 - ], - "source": { - "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 208 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "mul_865", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_as_real_127", - "op": "aten.view_as_real.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 64, - 2 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_as_real_127", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.0.attention", - "name": "view_1562", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1562", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "convert_element_type_2203", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 32, - 128 - ], - "source": { - "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "apply_rotary_emb", - "line": 207 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "squeeze_62", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_1563", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2202", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], 
- "module_path": "L['self'].layers.0.attention", - "name": "view_1564", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "convert_element_type_2203", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "view_1565", - "op": "aten.view.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1563", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "alias_default_1414", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 297 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 240, - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1414", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": 
"alias_default_7", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wv", - "name": "einsum_default_669", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_10", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wv", - "name": "permute_1371", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1414", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 36.328589861751155, - "dst_placement": "RR", - "name": "permute_1371", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention.wv", - "name": "einsum_default_670", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "cluster_id": 241, - 
"compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_669", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wv", - "name": "permute_1372", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 242, - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1372", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wv", - "name": "dtype_cast_577", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 243, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_577", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention.wv", - "name": "alias_default_1424", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, 
- { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1564", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "alias_default_1415", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 1024 - ], - "source": { - "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 296 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 244, - "compute_cost": 56.12241179704158, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1415", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_7", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wk", - "name": "einsum_default_671", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 1024 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_9", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wk", - "name": "permute_1375", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 49.631284656940636, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 86.07528421052632, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1415", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "permute_1375", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wk", - "name": "einsum_default_672", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_670", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_672", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention", - "name": "add_382", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 245, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_671", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.0.attention.wk", - "name": "permute_1376", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 246, - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1376", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wk", - "name": "dtype_cast_578", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 247, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 57.40529711375213, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_578", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention.wk", - "name": "alias_default_1423", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "view_1565", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - } - ], - 
"module_path": "L['self'].layers.0.attention", - "name": "alias_default_1416", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(2)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 295 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 248, - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(2)", - "name": "alias_default_1416", - "src_placement": "S(0)S(2)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)R", - "name": "alias_default_7", - "src_placement": "S(0)R", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wq", - "name": "einsum_default_673", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "P(sum)S(1)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RS(1)", - "name": "alias_default_8", - "src_placement": "RS(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wq", - "name": "permute_1379", - "op": "aten.permute.default", - "phase": "backward", - "placement": "RS(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 198.52513862776254, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 190.35670720457864, - 
"dst_placement": "S(0)S(1)", - "name": "alias_default_1416", - "src_placement": "S(0)S(2)", - "transition_cost": 1 - }, - { - "comm_cost": 94.3143594470046, - "dst_placement": "RR", - "name": "permute_1379", - "src_placement": "RS(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention.wq", - "name": "einsum_default_674", - "op": "aten.einsum.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 2.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_382", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "einsum_default_674", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0", - "name": "add_383", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 249, - "compute_cost": 0.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(1)", - "name": "einsum_default_673", - "src_placement": "P(sum)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wq", - "name": "permute_1380", - "op": "aten.permute.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)", - "file": 
"/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 290 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 250, - "compute_cost": 9.760419442963753, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)S(0)", - "name": "permute_1380", - "src_placement": "P(sum)S(0)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention.wq", - "name": "dtype_cast_579", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 251, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 160.272, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_579", - "src_placement": "P(sum)S(0)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention.wq", - "name": "alias_default_1422", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "add_383", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "convert_element_type_2216", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return 
torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_3", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "convert_element_type_2217", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_4", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "convert_element_type_2218", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "RR", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2216", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "alias_default_1417", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, 
eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1417", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "convert_element_type_2218", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "mul_866", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2217", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "mul_867", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_866", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "alias_default_1418", 
- "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_867", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "alias_default_1419", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1419", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1418", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "mul_868", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_868", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": 
"L['self'].layers.0.attention_norm", - "name": "sum_193", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 1 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.05557036247335, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1419", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "div_96", - "op": "aten.div.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "div_96", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sum_193", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "mul_869", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1418", - "src_placement": "S(0)S(1)", - 
"transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_869", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "sub_96", - "op": "aten.sub.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 52.06192480221486, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "sub_96", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_6", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "mul_870", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 252, - "compute_cost": 78.08335554371003, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1417", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1419", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "mul_871", - "op": "aten.mul.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 253, - "compute_cost": 26.034139620978188, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_871", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "sum_194", - "op": "aten.sum.dim_IntList", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "mul_870", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "convert_element_type_2219", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 254, - "compute_cost": 7.0, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "sum_194", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "convert_element_type_2220", - "op": "prims.convert_element_type.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, 
normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 39.041677771855014, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "alias_default_1413", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - }, - { - "comm_cost": 0.0, - "dst_placement": "S(0)S(1)", - "name": "convert_element_type_2219", - "src_placement": "S(0)S(1)", - "transition_cost": 0 - } - ], - "name": "add_384", - "op": "aten.add.Tensor", - "phase": "backward", - "placement": "S(0)S(1)", - "shape": [ - 8, - 8192, - 4096 - ], - "source": { - "code": "return torch.rms_norm(input, normalized_shape, weight, eps)", - "file": "/data/users/wangkj/pytorch/torch/nn/functional.py", - "func": "rms_norm", - "line": 2964 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 255, - "compute_cost": 7.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "P(sum)P(sum)", - "name": "convert_element_type_2220", - "src_placement": "P(sum)P(sum)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "dtype_cast_580", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "P(sum)P(sum)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "cluster_id": 256, - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 28.41652038284296, - "dst_placement": "S(0)S(0)", - "name": "dtype_cast_580", - "src_placement": "P(sum)P(sum)", - "transition_cost": 1 - } - ], - "module_path": "L['self'].layers.0.attention_norm", - "name": "alias_default_1429", - "op": "aten.alias.default", - "phase": "backward", - 
"placement": "S(0)S(0)", - "shape": [ - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 38.685829146330285, - "dtype": "bfloat16", - "inputs": [ - { - "comm_cost": 706.2108351658422, - "dst_placement": "S(2)S(2)", - "name": "add_384", - "src_placement": "S(0)S(1)", - "transition_cost": 1 - }, - { - "comm_cost": 0.0, - "dst_placement": "RR", - "name": "alias_default_1", - "src_placement": "RR", - "transition_cost": 0 - } - ], - "module_path": "L['self'].tok_embeddings", - "name": "embedding_dense_backward", - "op": "aten.embedding_dense_backward.default", - "phase": "backward", - "placement": "S(1)S(1)", - "shape": [ - 128256, - 4096 - ], - "source": { - "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py", - "func": "forward", - "line": 539 - }, - "transition_cost": 1.0 - }, - { - "compute_cost": 76.40578345195063, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(1)S(1)", - "name": "embedding_dense_backward", - "src_placement": "S(1)S(1)", - "transition_cost": 0 - } - ], - "module_path": "L['self'].tok_embeddings", - "name": "dtype_cast_581", - "op": "autoparallel.dtype_cast.default", - "phase": "backward", - "placement": "S(1)S(1)", - "shape": [ - 128256, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "compute_cost": 0.0, - "dtype": "float32", - "inputs": [ - { - "comm_cost": 0.0, - "dst_placement": "S(1)S(1)", - "name": "dtype_cast_581", - "src_placement": "S(1)S(1)", - "transition_cost": 0 - } - 
], - "module_path": "L['self'].tok_embeddings", - "name": "alias_default_1421", - "op": "aten.alias.default", - "phase": "backward", - "placement": "S(1)S(1)", - "shape": [ - 128256, - 4096 - ], - "source": { - "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)", - "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py", - "func": "getter", - "line": 25 - }, - "transition_cost": 0.0 - }, - { - "inputs": [ - { - "name": "alias_default_1420" - }, - { - "name": "alias_default_1421" - }, - { - "name": "alias_default_1422" - }, - { - "name": "alias_default_1423" - }, - { - "name": "alias_default_1424" - }, - { - "name": "alias_default_1425" - }, - { - "name": "alias_default_1426" - }, - { - "name": "alias_default_1427" - }, - { - "name": "alias_default_1428" - }, - { - "name": "alias_default_1429" - }, - { - "name": "alias_default_1430" - }, - { - "name": "alias_default_1431" - }, - { - "name": "alias_default_1432" - }, - { - "name": "alias_default_1433" - }, - { - "name": "alias_default_1434" - }, - { - "name": "alias_default_1435" - }, - { - "name": "alias_default_1436" - }, - { - "name": "alias_default_1437" - }, - { - "name": "alias_default_1438" - }, - { - "name": "alias_default_1439" - }, - { - "name": "alias_default_1440" - }, - { - "name": "alias_default_1441" - }, - { - "name": "alias_default_1442" - }, - { - "name": "alias_default_1443" - }, - { - "name": "alias_default_1444" - }, - { - "name": "alias_default_1445" - }, - { - "name": "alias_default_1446" - }, - { - "name": "alias_default_1447" - }, - { - "name": "alias_default_1448" - }, - { - "name": "alias_default_1449" - }, - { - "name": "alias_default_1450" - }, - { - "name": "alias_default_1451" - }, - { - "name": "alias_default_1452" - }, - { - "name": "alias_default_1453" - }, - { - "name": "alias_default_1454" - }, - { - "name": "alias_default_1455" - }, - { - "name": "alias_default_1456" - }, - { - "name": "alias_default_1457" - }, - { - "name": 
"alias_default_1458" - }, - { - "name": "alias_default_1459" - }, - { - "name": "alias_default_1460" - }, - { - "name": "alias_default_1461" - }, - { - "name": "alias_default_1462" - }, - { - "name": "alias_default_1463" - }, - { - "name": "alias_default_1464" - }, - { - "name": "alias_default_1465" - }, - { - "name": "alias_default_1466" - }, - { - "name": "alias_default_1467" - }, - { - "name": "alias_default_1468" - }, - { - "name": "alias_default_1469" - }, - { - "name": "alias_default_1470" - }, - { - "name": "alias_default_1471" - }, - { - "name": "alias_default_1472" - }, - { - "name": "alias_default_1473" - }, - { - "name": "alias_default_1474" - }, - { - "name": "alias_default_1475" - }, - { - "name": "alias_default_1476" - }, - { - "name": "alias_default_1477" - }, - { - "name": "alias_default_1478" - }, - { - "name": "alias_default_1479" - }, - { - "name": "alias_default_1480" - }, - { - "name": "alias_default_1481" - }, - { - "name": "alias_default_1482" - }, - { - "name": "alias_default_1483" - }, - { - "name": "alias_default_1484" - }, - { - "name": "alias_default_1485" - }, - { - "name": "alias_default_1486" - }, - { - "name": "alias_default_1487" - }, - { - "name": "alias_default_1488" - }, - { - "name": "alias_default_1489" - }, - { - "name": "alias_default_1490" - }, - { - "name": "alias_default_1491" - }, - { - "name": "alias_default_1492" - }, - { - "name": "alias_default_1493" - }, - { - "name": "alias_default_1494" - }, - { - "name": "alias_default_1495" - }, - { - "name": "alias_default_1496" - }, - { - "name": "alias_default_1497" - }, - { - "name": "alias_default_1498" - }, - { - "name": "alias_default_1499" - }, - { - "name": "alias_default_1500" - }, - { - "name": "alias_default_1501" - }, - { - "name": "alias_default_1502" - }, - { - "name": "alias_default_1503" - }, - { - "name": "alias_default_1504" - }, - { - "name": "alias_default_1505" - }, - { - "name": "alias_default_1506" - }, - { - "name": "alias_default_1507" - }, - { - "name": 
"alias_default_1508" - }, - { - "name": "alias_default_1509" - }, - { - "name": "alias_default_1510" - }, - { - "name": "alias_default_1511" - }, - { - "name": "alias_default_1512" - }, - { - "name": "alias_default_1513" - }, - { - "name": "alias_default_1514" - }, - { - "name": "alias_default_1515" - }, - { - "name": "alias_default_1516" - }, - { - "name": "alias_default_1517" - }, - { - "name": "alias_default_1518" - }, - { - "name": "alias_default_1519" - }, - { - "name": "alias_default_1520" - }, - { - "name": "alias_default_1521" - }, - { - "name": "alias_default_1522" - }, - { - "name": "alias_default_1523" - }, - { - "name": "alias_default_1524" - }, - { - "name": "alias_default_1525" - }, - { - "name": "alias_default_1526" - }, - { - "name": "alias_default_1527" - }, - { - "name": "alias_default_1528" - }, - { - "name": "alias_default_1529" - }, - { - "name": "alias_default_1530" - }, - { - "name": "alias_default_1531" - }, - { - "name": "alias_default_1532" - }, - { - "name": "alias_default_1533" - }, - { - "name": "alias_default_1534" - }, - { - "name": "alias_default_1535" - }, - { - "name": "alias_default_1536" - }, - { - "name": "alias_default_1537" - }, - { - "name": "alias_default_1538" - }, - { - "name": "alias_default_1539" - }, - { - "name": "alias_default_1540" - }, - { - "name": "alias_default_1541" - }, - { - "name": "alias_default_1542" - }, - { - "name": "alias_default_1543" - }, - { - "name": "alias_default_1544" - }, - { - "name": "alias_default_1545" - }, - { - "name": "alias_default_1546" - }, - { - "name": "alias_default_1547" - }, - { - "name": "alias_default_1548" - }, - { - "name": "alias_default_1549" - }, - { - "name": "alias_default_1550" - }, - { - "name": "alias_default_1551" - }, - { - "name": "alias_default_1552" - }, - { - "name": "alias_default_1553" - }, - { - "name": "alias_default_1554" - }, - { - "name": "alias_default_1555" - }, - { - "name": "alias_default_1556" - }, - { - "name": "alias_default_1557" - }, - { - "name": 
"alias_default_1558" - }, - { - "name": "alias_default_1559" - }, - { - "name": "alias_default_1560" - }, - { - "name": "alias_default_1561" - }, - { - "name": "alias_default_1562" - }, - { - "name": "alias_default_1563" - }, - { - "name": "alias_default_1564" - }, - { - "name": "alias_default_1565" - }, - { - "name": "alias_default_1566" - }, - { - "name": "alias_default_1567" - }, - { - "name": "alias_default_1568" - }, - { - "name": "alias_default_1569" - }, - { - "name": "alias_default_1570" - }, - { - "name": "alias_default_1571" - }, - { - "name": "alias_default_1572" - }, - { - "name": "alias_default_1573" - }, - { - "name": "alias_default_1574" - }, - { - "name": "alias_default_1575" - }, - { - "name": "alias_default_1576" - }, - { - "name": "alias_default_1577" - }, - { - "name": "alias_default_1578" - }, - { - "name": "alias_default_1579" - }, - { - "name": "alias_default_1580" - }, - { - "name": "alias_default_1581" - }, - { - "name": "alias_default_1582" - }, - { - "name": "alias_default_1583" - }, - { - "name": "alias_default_1584" - }, - { - "name": "alias_default_1585" - }, - { - "name": "alias_default_1586" - }, - { - "name": "alias_default_1587" - }, - { - "name": "alias_default_1588" - }, - { - "name": "alias_default_1589" - }, - { - "name": "alias_default_1590" - }, - { - "name": "alias_default_1591" - }, - { - "name": "alias_default_1592" - }, - { - "name": "alias_default_1593" - }, - { - "name": "alias_default_1594" - }, - { - "name": "alias_default_1595" - }, - { - "name": "alias_default_1596" - }, - { - "name": "alias_default_1597" - }, - { - "name": "alias_default_1598" - }, - { - "name": "alias_default_1599" - }, - { - "name": "alias_default_1600" - }, - { - "name": "alias_default_1601" - }, - { - "name": "alias_default_1602" - }, - { - "name": "alias_default_1603" - }, - { - "name": "alias_default_1604" - }, - { - "name": "alias_default_1605" - }, - { - "name": "alias_default_1606" - }, - { - "name": "alias_default_1607" - }, - { - "name": 
"alias_default_1608" - }, - { - "name": "alias_default_1609" - }, - { - "name": "alias_default_1610" - }, - { - "name": "alias_default_1611" - }, - { - "name": "alias_default_1612" - }, - { - "name": "alias_default_1613" - }, - { - "name": "alias_default_1614" - }, - { - "name": "alias_default_1615" - }, - { - "name": "alias_default_1616" - }, - { - "name": "alias_default_1617" - }, - { - "name": "alias_default_1618" - }, - { - "name": "alias_default_1619" - }, - { - "name": "alias_default_1620" - }, - { - "name": "alias_default_1621" - }, - { - "name": "alias_default_1622" - }, - { - "name": "alias_default_1623" - }, - { - "name": "alias_default_1624" - }, - { - "name": "alias_default_1625" - }, - { - "name": "alias_default_1626" - }, - { - "name": "alias_default_1627" - }, - { - "name": "alias_default_1628" - }, - { - "name": "alias_default_1629" - }, - { - "name": "alias_default_1630" - }, - { - "name": "alias_default_1631" - }, - { - "name": "alias_default_1632" - }, - { - "name": "alias_default_1633" - }, - { - "name": "alias_default_1634" - }, - { - "name": "alias_default_1635" - }, - { - "name": "alias_default_1636" - }, - { - "name": "alias_default_1637" - }, - { - "name": "alias_default_1638" - }, - { - "name": "alias_default_1639" - }, - { - "name": "alias_default_1640" - }, - { - "name": "alias_default_1641" - }, - { - "name": "alias_default_1642" - }, - { - "name": "alias_default_1643" - }, - { - "name": "alias_default_1644" - }, - { - "name": "alias_default_1645" - }, - { - "name": "alias_default_1646" - }, - { - "name": "alias_default_1647" - }, - { - "name": "alias_default_1648" - }, - { - "name": "alias_default_1649" - }, - { - "name": "alias_default_1650" - }, - { - "name": "alias_default_1651" - }, - { - "name": "alias_default_1652" - }, - { - "name": "alias_default_1653" - }, - { - "name": "alias_default_1654" - }, - { - "name": "alias_default_1655" - }, - { - "name": "alias_default_1656" - }, - { - "name": "alias_default_1657" - }, - { - "name": 
"alias_default_1658" - }, - { - "name": "alias_default_1659" - }, - { - "name": "alias_default_1660" - }, - { - "name": "alias_default_1661" - }, - { - "name": "alias_default_1662" - }, - { - "name": "alias_default_1663" - }, - { - "name": "alias_default_1664" - }, - { - "name": "alias_default_1665" - }, - { - "name": "alias_default_1666" - }, - { - "name": "alias_default_1667" - }, - { - "name": "alias_default_1668" - }, - { - "name": "alias_default_1669" - }, - { - "name": "alias_default_1670" - }, - { - "name": "alias_default_1671" - }, - { - "name": "alias_default_1672" - }, - { - "name": "alias_default_1673" - }, - { - "name": "alias_default_1674" - }, - { - "name": "alias_default_1675" - }, - { - "name": "alias_default_1676" - }, - { - "name": "alias_default_1677" - }, - { - "name": "alias_default_1678" - }, - { - "name": "alias_default_1679" - }, - { - "name": "alias_default_1680" - }, - { - "name": "alias_default_1681" - }, - { - "name": "alias_default_1682" - }, - { - "name": "alias_default_1683" - }, - { - "name": "alias_default_1684" - }, - { - "name": "alias_default_1685" - }, - { - "name": "alias_default_1686" - }, - { - "name": "alias_default_1687" - }, - { - "name": "alias_default_1688" - }, - { - "name": "alias_default_1689" - }, - { - "name": "alias_default_1690" - }, - { - "name": "alias_default_1691" - }, - { - "name": "alias_default_1692" - }, - { - "name": "alias_default_1693" - }, - { - "name": "alias_default_1694" - }, - { - "name": "alias_default_1695" - }, - { - "name": "alias_default_1696" - }, - { - "name": "alias_default_1697" - }, - { - "name": "alias_default_1698" - }, - { - "name": "alias_default_1699" - }, - { - "name": "alias_default_1700" - }, - { - "name": "alias_default_1701" - }, - { - "name": "alias_default_1702" - }, - { - "name": "alias_default_1703" - }, - { - "name": "alias_default_1704" - }, - { - "name": "alias_default_1705" - }, - { - "name": "alias_default_1706" - }, - { - "name": "alias_default_1707" - }, - { - "name": 
"alias_default_1708" - }, - { - "name": "alias_default_1709" - }, - { - "name": "alias_default_1710" - }, - { - "name": "alias_default_1711" - } - ], - "name": "output", - "op": "output" - } - ], - "summary": { - "comm": 212780.17498325979, - "compute": 581120.8234224034, - "total": 794933.9984056632, - "transition": 1033.0 - } -} \ No newline at end of file diff --git a/profile_results/llama3_8b_4x4_strategy_summary.json b/profile_results/llama3_8b_4x4_strategy_summary.json deleted file mode 100644 index ccdeb4d9..00000000 --- a/profile_results/llama3_8b_4x4_strategy_summary.json +++ /dev/null @@ -1,2054 +0,0 @@ -{ - "config": { - "batch_size": 8, - "input_constraint": "Shard(0), Replicate()", - "mesh_dim_names": [ - "dp", - "tp" - ], - "mesh_shape": [ - 4, - 4 - ], - "model": "autoparallel._testing.models.llama3 Transformer 8B config", - "output_constraint": "Shard(0), Shard(2)", - "seqlen": 8192, - "vocab_size": 128256, - "world_size": 16 - }, - "elapsed_s": 115.23945621983148, - "json_summary": { - "comm": 212780.17498325979, - "compute": 581120.8234224034, - "total": 794933.9984056632, - "transition": 1033.0 - }, - "optimizer_profile": { - "ilp": { - "cluster_copied_decision_variables": 8181840, - "constraints": 175408, - "logical_decision_variables": 8657526, - "unique_variables": 475686 - }, - "last_solve": { - "constraints": 175412, - "extract_s": 0.044429945992305875, - "kind": "solve", - "objective": 794933.998405679, - "objective_s": 3.8023465629667044, - "pipeline_total_s": 102.16174313612282, - "solve_s": 59.80278266593814, - "status": "Optimal", - "total_s": 63.73084603413008, - "unique_variables": 475686 - }, - "mesh": { - "dim_names": [ - "dp", - "tp" - ], - "ndim": 2, - "shape": [ - 4, - 4 - ], - "size": 16 - }, - "model": { - "graph_nodes": 8668, - "op_counts": { - "call_function": 8373, - "output": 1, - "placeholder": 294 - }, - "parameter_bytes": 32121044992, - "parameter_nodes": 291, - "parameter_numel": 8030261248, - "tensor_nodes": 8667, - 
"unknown_parameter_nodes": 0 - }, - "strategies": { - "max_strategies_per_node": 81, - "nodes": 8668, - "option_tuples": 8657526, - "strategy_options": 220687 - }, - "timings": { - "compute_cost_estimation_s": 1.9735342266503721, - "constraint_construction_s": 3.2506618059705943, - "cost_estimation_s": 4.9254587206523865, - "decision_var_build_s": 15.363263476872817, - "decision_var_overhead_s": 6.9146421970799565, - "edge_cost_estimation_s": 2.9519244940020144, - "ilp_construction_s": 13.688466562191024, - "init_total_s": 38.43089710199274, - "pulp_var_creation_s": 3.5231625591404736, - "strategy_enumeration_s": 10.847158421995118, - "validation_s": 0.060926787089556456 - } - }, - "param_strategy_groups": { - "layers.*.attention.wk.weight": { - "S(0)S(0)": 32 - }, - "layers.*.attention.wo.weight": { - "S(0)S(0)": 32 - }, - "layers.*.attention.wq.weight": { - "S(0)S(0)": 32 - }, - "layers.*.attention.wv.weight": { - "S(0)S(0)": 32 - }, - "layers.*.attention_norm.weight": { - "S(0)S(0)": 32 - }, - "layers.*.feed_forward.w1.weight": { - "S(0)S(0)": 32 - }, - "layers.*.feed_forward.w2.weight": { - "S(0)S(1)": 32 - }, - "layers.*.feed_forward.w3.weight": { - "S(0)S(0)": 32 - }, - "layers.*.ffn_norm.weight": { - "S(0)S(0)": 32 - }, - "norm.weight": { - "S(0)S(0)": 1 - }, - "output.weight": { - "S(0)S(0)": 1 - }, - "tok_embeddings.weight": { - "S(1)S(1)": 1 - } - }, - "phase_placement_counts": { - "backward": [ - [ - "S(0)S(2)", - 1634 - ], - [ - "S(0)S(1)", - 1423 - ], - [ - "P(sum)S(0)", - 354 - ], - [ - "P(sum)P(sum)", - 291 - ], - [ - "S(0)S(0)", - 258 - ], - [ - "RR", - 257 - ], - [ - "P(sum)S(1)", - 225 - ], - [ - "RS(0)", - 129 - ], - [ - "S(0)P(sum)", - 97 - ], - [ - "S(0)R", - 32 - ], - [ - "RS(1)", - 32 - ], - [ - "(S(0)S(1), S(0)S(1), S(0)S(1))", - 32 - ], - [ - "S(1)S(1)", - 3 - ] - ], - "forward": [ - [ - "S(0)S(2)", - 1378 - ], - [ - "S(0)S(1)", - 1227 - ], - [ - "S(0)S(0)", - 516 - ], - [ - "RR", - 324 - ], - [ - "RS(1)", - 258 - ], - [ - "S(0)R", - 66 - 
], - [ - "RS(0)", - 64 - ], - [ - "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - 32 - ], - [ - "S(0)P(sum)", - 32 - ], - [ - "S(1)S(1)", - 2 - ], - [ - "S(2)S(2)", - 1 - ] - ] - }, - "placement_counts": [ - [ - "S(0)S(2)", - 3012 - ], - [ - "S(0)S(1)", - 2650 - ], - [ - "S(0)S(0)", - 774 - ], - [ - "RR", - 581 - ], - [ - "P(sum)S(0)", - 354 - ], - [ - "P(sum)P(sum)", - 291 - ], - [ - "RS(1)", - 290 - ], - [ - "P(sum)S(1)", - 225 - ], - [ - "RS(0)", - 193 - ], - [ - "S(0)P(sum)", - 129 - ], - [ - "S(0)R", - 98 - ], - [ - "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)", - 32 - ], - [ - "(S(0)S(1), S(0)S(1), S(0)S(1))", - 32 - ], - [ - "S(1)S(1)", - 5 - ], - [ - "S(2)S(2)", - 1 - ] - ], - "sample_forward_interesting_nodes": [ - { - "inputs": [], - "module_path": "layers.0.attention.wq.weight", - "name": "primals_2", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.0.attention.wk.weight", - "name": "primals_3", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.0.attention.wv.weight", - "name": "primals_4", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.0.attention.wo.weight", - "name": "primals_5", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.0.feed_forward.w1.weight", - "name": "primals_6", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.0.feed_forward.w2.weight", - "name": "primals_7", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.0.feed_forward.w3.weight", - "name": "primals_8", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ 
- 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.1.attention.wq.weight", - "name": "primals_11", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.1.attention.wk.weight", - "name": "primals_12", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.1.attention.wv.weight", - "name": "primals_13", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.1.attention.wo.weight", - "name": "primals_14", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.1.feed_forward.w1.weight", - "name": "primals_15", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.1.feed_forward.w2.weight", - "name": "primals_16", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.1.feed_forward.w3.weight", - "name": "primals_17", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.2.attention.wq.weight", - "name": "primals_20", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.2.attention.wk.weight", - "name": "primals_21", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.2.attention.wv.weight", - "name": "primals_22", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.2.attention.wo.weight", - "name": "primals_23", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 
4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.2.feed_forward.w1.weight", - "name": "primals_24", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.2.feed_forward.w2.weight", - "name": "primals_25", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.2.feed_forward.w3.weight", - "name": "primals_26", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.3.attention.wq.weight", - "name": "primals_29", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.3.attention.wk.weight", - "name": "primals_30", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.3.attention.wv.weight", - "name": "primals_31", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.3.attention.wo.weight", - "name": "primals_32", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.3.feed_forward.w1.weight", - "name": "primals_33", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.3.feed_forward.w2.weight", - "name": "primals_34", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.3.feed_forward.w3.weight", - "name": "primals_35", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.4.attention.wq.weight", - "name": "primals_38", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": 
[ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.4.attention.wk.weight", - "name": "primals_39", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.4.attention.wv.weight", - "name": "primals_40", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.4.attention.wo.weight", - "name": "primals_41", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.4.feed_forward.w1.weight", - "name": "primals_42", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.4.feed_forward.w2.weight", - "name": "primals_43", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.4.feed_forward.w3.weight", - "name": "primals_44", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.5.attention.wq.weight", - "name": "primals_47", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.5.attention.wk.weight", - "name": "primals_48", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.5.attention.wv.weight", - "name": "primals_49", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.5.attention.wo.weight", - "name": "primals_50", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.5.feed_forward.w1.weight", - "name": "primals_51", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 
14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.5.feed_forward.w2.weight", - "name": "primals_52", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.5.feed_forward.w3.weight", - "name": "primals_53", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.6.attention.wq.weight", - "name": "primals_56", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.6.attention.wk.weight", - "name": "primals_57", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.6.attention.wv.weight", - "name": "primals_58", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.6.attention.wo.weight", - "name": "primals_59", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.6.feed_forward.w1.weight", - "name": "primals_60", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.6.feed_forward.w2.weight", - "name": "primals_61", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.6.feed_forward.w3.weight", - "name": "primals_62", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.7.attention.wq.weight", - "name": "primals_65", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.7.attention.wk.weight", - "name": "primals_66", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ 
- 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.7.attention.wv.weight", - "name": "primals_67", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.7.attention.wo.weight", - "name": "primals_68", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.7.feed_forward.w1.weight", - "name": "primals_69", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.7.feed_forward.w2.weight", - "name": "primals_70", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.7.feed_forward.w3.weight", - "name": "primals_71", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.8.attention.wq.weight", - "name": "primals_74", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.8.attention.wk.weight", - "name": "primals_75", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.8.attention.wv.weight", - "name": "primals_76", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.8.attention.wo.weight", - "name": "primals_77", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.8.feed_forward.w1.weight", - "name": "primals_78", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.8.feed_forward.w2.weight", - "name": "primals_79", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ 
- 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.8.feed_forward.w3.weight", - "name": "primals_80", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.9.attention.wq.weight", - "name": "primals_83", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.9.attention.wk.weight", - "name": "primals_84", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.9.attention.wv.weight", - "name": "primals_85", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.9.attention.wo.weight", - "name": "primals_86", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.9.feed_forward.w1.weight", - "name": "primals_87", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.9.feed_forward.w2.weight", - "name": "primals_88", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.9.feed_forward.w3.weight", - "name": "primals_89", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.10.attention.wq.weight", - "name": "primals_92", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.10.attention.wk.weight", - "name": "primals_93", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.10.attention.wv.weight", - "name": "primals_94", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ 
- 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.10.attention.wo.weight", - "name": "primals_95", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.10.feed_forward.w1.weight", - "name": "primals_96", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.10.feed_forward.w2.weight", - "name": "primals_97", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.10.feed_forward.w3.weight", - "name": "primals_98", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.11.attention.wq.weight", - "name": "primals_101", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.11.attention.wk.weight", - "name": "primals_102", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.11.attention.wv.weight", - "name": "primals_103", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.11.attention.wo.weight", - "name": "primals_104", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.11.feed_forward.w1.weight", - "name": "primals_105", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.11.feed_forward.w2.weight", - "name": "primals_106", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.11.feed_forward.w3.weight", - "name": "primals_107", - "op": "placeholder", - "placement": 
"S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.12.attention.wq.weight", - "name": "primals_110", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.12.attention.wk.weight", - "name": "primals_111", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.12.attention.wv.weight", - "name": "primals_112", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.12.attention.wo.weight", - "name": "primals_113", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.12.feed_forward.w1.weight", - "name": "primals_114", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.12.feed_forward.w2.weight", - "name": "primals_115", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.12.feed_forward.w3.weight", - "name": "primals_116", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.13.attention.wq.weight", - "name": "primals_119", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.13.attention.wk.weight", - "name": "primals_120", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.13.attention.wv.weight", - "name": "primals_121", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.13.attention.wo.weight", - "name": "primals_122", - "op": "placeholder", 
- "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.13.feed_forward.w1.weight", - "name": "primals_123", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.13.feed_forward.w2.weight", - "name": "primals_124", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.13.feed_forward.w3.weight", - "name": "primals_125", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.14.attention.wq.weight", - "name": "primals_128", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.14.attention.wk.weight", - "name": "primals_129", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.14.attention.wv.weight", - "name": "primals_130", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.14.attention.wo.weight", - "name": "primals_131", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.14.feed_forward.w1.weight", - "name": "primals_132", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.14.feed_forward.w2.weight", - "name": "primals_133", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.14.feed_forward.w3.weight", - "name": "primals_134", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.15.attention.wq.weight", - "name": 
"primals_137", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.15.attention.wk.weight", - "name": "primals_138", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.15.attention.wv.weight", - "name": "primals_139", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.15.attention.wo.weight", - "name": "primals_140", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.15.feed_forward.w1.weight", - "name": "primals_141", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.15.feed_forward.w2.weight", - "name": "primals_142", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.15.feed_forward.w3.weight", - "name": "primals_143", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.16.attention.wq.weight", - "name": "primals_146", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.16.attention.wk.weight", - "name": "primals_147", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.16.attention.wv.weight", - "name": "primals_148", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.16.attention.wo.weight", - "name": "primals_149", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": 
"layers.16.feed_forward.w1.weight", - "name": "primals_150", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.16.feed_forward.w2.weight", - "name": "primals_151", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.16.feed_forward.w3.weight", - "name": "primals_152", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.17.attention.wq.weight", - "name": "primals_155", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.17.attention.wk.weight", - "name": "primals_156", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.17.attention.wv.weight", - "name": "primals_157", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.17.attention.wo.weight", - "name": "primals_158", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.17.feed_forward.w1.weight", - "name": "primals_159", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.17.feed_forward.w2.weight", - "name": "primals_160", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.17.feed_forward.w3.weight", - "name": "primals_161", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.18.attention.wq.weight", - "name": "primals_164", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - 
"inputs": [], - "module_path": "layers.18.attention.wk.weight", - "name": "primals_165", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.18.attention.wv.weight", - "name": "primals_166", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.18.attention.wo.weight", - "name": "primals_167", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.18.feed_forward.w1.weight", - "name": "primals_168", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.18.feed_forward.w2.weight", - "name": "primals_169", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.18.feed_forward.w3.weight", - "name": "primals_170", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.19.attention.wq.weight", - "name": "primals_173", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.19.attention.wk.weight", - "name": "primals_174", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.19.attention.wv.weight", - "name": "primals_175", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.19.attention.wo.weight", - "name": "primals_176", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.19.feed_forward.w1.weight", - "name": "primals_177", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 
4096 - ] - }, - { - "inputs": [], - "module_path": "layers.19.feed_forward.w2.weight", - "name": "primals_178", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.19.feed_forward.w3.weight", - "name": "primals_179", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.20.attention.wq.weight", - "name": "primals_182", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.20.attention.wk.weight", - "name": "primals_183", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.20.attention.wv.weight", - "name": "primals_184", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.20.attention.wo.weight", - "name": "primals_185", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.20.feed_forward.w1.weight", - "name": "primals_186", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.20.feed_forward.w2.weight", - "name": "primals_187", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.20.feed_forward.w3.weight", - "name": "primals_188", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.21.attention.wq.weight", - "name": "primals_191", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.21.attention.wk.weight", - "name": "primals_192", - "op": "placeholder", - "placement": "S(0)S(0)", 
- "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.21.attention.wv.weight", - "name": "primals_193", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.21.attention.wo.weight", - "name": "primals_194", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.21.feed_forward.w1.weight", - "name": "primals_195", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.21.feed_forward.w2.weight", - "name": "primals_196", - "op": "placeholder", - "placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - }, - { - "inputs": [], - "module_path": "layers.21.feed_forward.w3.weight", - "name": "primals_197", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.22.attention.wq.weight", - "name": "primals_200", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.22.attention.wk.weight", - "name": "primals_201", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.22.attention.wv.weight", - "name": "primals_202", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 1024, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.22.attention.wo.weight", - "name": "primals_203", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 4096, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.22.feed_forward.w1.weight", - "name": "primals_204", - "op": "placeholder", - "placement": "S(0)S(0)", - "shape": [ - 14336, - 4096 - ] - }, - { - "inputs": [], - "module_path": "layers.22.feed_forward.w2.weight", - "name": "primals_205", - "op": "placeholder", - 
"placement": "S(0)S(1)", - "shape": [ - 4096, - 14336 - ] - } - ] -} \ No newline at end of file diff --git a/profile_results/real_llama3_3b_dag_node_stats.csv b/profile_results/real_llama3_3b_dag_node_stats.csv deleted file mode 100644 index 5f813f1b..00000000 --- a/profile_results/real_llama3_3b_dag_node_stats.csv +++ /dev/null @@ -1,7200 +0,0 @@ -idx,name,op,target,phase,layer,direct_dependency_args,direct_dependency_nodes,direct_offspring_nodes,ancestor_count,descendant_count,strategy_count -0,primals_1,placeholder,primals_1,unknown,,0,0,1,0,5816,3 -1,primals_2,placeholder,primals_2,unknown,,0,0,1,0,5777,3 -2,primals_3,placeholder,primals_3,unknown,,0,0,1,0,5777,3 -3,primals_4,placeholder,primals_4,unknown,,0,0,1,0,5770,3 -4,primals_5,placeholder,primals_5,unknown,,0,0,1,0,5757,3 -5,primals_6,placeholder,primals_6,unknown,,0,0,1,0,5737,3 -6,primals_7,placeholder,primals_7,unknown,,0,0,1,0,5714,3 -7,primals_8,placeholder,primals_8,unknown,,0,0,1,0,5718,3 -8,primals_9,placeholder,primals_9,unknown,,0,0,1,0,5794,2 -9,primals_10,placeholder,primals_10,unknown,,0,0,1,0,5741,2 -10,primals_11,placeholder,primals_11,unknown,,0,0,1,0,5681,3 -11,primals_12,placeholder,primals_12,unknown,,0,0,1,0,5681,3 -12,primals_13,placeholder,primals_13,unknown,,0,0,1,0,5674,3 -13,primals_14,placeholder,primals_14,unknown,,0,0,1,0,5661,3 -14,primals_15,placeholder,primals_15,unknown,,0,0,1,0,5641,3 -15,primals_16,placeholder,primals_16,unknown,,0,0,1,0,5618,3 -16,primals_17,placeholder,primals_17,unknown,,0,0,1,0,5622,3 -17,primals_18,placeholder,primals_18,unknown,,0,0,1,0,5698,2 -18,primals_19,placeholder,primals_19,unknown,,0,0,1,0,5645,2 -19,primals_20,placeholder,primals_20,unknown,,0,0,1,0,5585,3 -20,primals_21,placeholder,primals_21,unknown,,0,0,1,0,5585,3 -21,primals_22,placeholder,primals_22,unknown,,0,0,1,0,5578,3 -22,primals_23,placeholder,primals_23,unknown,,0,0,1,0,5565,3 -23,primals_24,placeholder,primals_24,unknown,,0,0,1,0,5545,3 
-24,primals_25,placeholder,primals_25,unknown,,0,0,1,0,5522,3 -25,primals_26,placeholder,primals_26,unknown,,0,0,1,0,5526,3 -26,primals_27,placeholder,primals_27,unknown,,0,0,1,0,5602,2 -27,primals_28,placeholder,primals_28,unknown,,0,0,1,0,5549,2 -28,primals_29,placeholder,primals_29,unknown,,0,0,1,0,5489,3 -29,primals_30,placeholder,primals_30,unknown,,0,0,1,0,5489,3 -30,primals_31,placeholder,primals_31,unknown,,0,0,1,0,5482,3 -31,primals_32,placeholder,primals_32,unknown,,0,0,1,0,5469,3 -32,primals_33,placeholder,primals_33,unknown,,0,0,1,0,5449,3 -33,primals_34,placeholder,primals_34,unknown,,0,0,1,0,5426,3 -34,primals_35,placeholder,primals_35,unknown,,0,0,1,0,5430,3 -35,primals_36,placeholder,primals_36,unknown,,0,0,1,0,5506,2 -36,primals_37,placeholder,primals_37,unknown,,0,0,1,0,5453,2 -37,primals_38,placeholder,primals_38,unknown,,0,0,1,0,5393,3 -38,primals_39,placeholder,primals_39,unknown,,0,0,1,0,5393,3 -39,primals_40,placeholder,primals_40,unknown,,0,0,1,0,5386,3 -40,primals_41,placeholder,primals_41,unknown,,0,0,1,0,5373,3 -41,primals_42,placeholder,primals_42,unknown,,0,0,1,0,5353,3 -42,primals_43,placeholder,primals_43,unknown,,0,0,1,0,5330,3 -43,primals_44,placeholder,primals_44,unknown,,0,0,1,0,5334,3 -44,primals_45,placeholder,primals_45,unknown,,0,0,1,0,5410,2 -45,primals_46,placeholder,primals_46,unknown,,0,0,1,0,5357,2 -46,primals_47,placeholder,primals_47,unknown,,0,0,1,0,5297,3 -47,primals_48,placeholder,primals_48,unknown,,0,0,1,0,5297,3 -48,primals_49,placeholder,primals_49,unknown,,0,0,1,0,5290,3 -49,primals_50,placeholder,primals_50,unknown,,0,0,1,0,5277,3 -50,primals_51,placeholder,primals_51,unknown,,0,0,1,0,5257,3 -51,primals_52,placeholder,primals_52,unknown,,0,0,1,0,5234,3 -52,primals_53,placeholder,primals_53,unknown,,0,0,1,0,5238,3 -53,primals_54,placeholder,primals_54,unknown,,0,0,1,0,5314,2 -54,primals_55,placeholder,primals_55,unknown,,0,0,1,0,5261,2 -55,primals_56,placeholder,primals_56,unknown,,0,0,1,0,5201,3 
-56,primals_57,placeholder,primals_57,unknown,,0,0,1,0,5201,3 -57,primals_58,placeholder,primals_58,unknown,,0,0,1,0,5194,3 -58,primals_59,placeholder,primals_59,unknown,,0,0,1,0,5181,3 -59,primals_60,placeholder,primals_60,unknown,,0,0,1,0,5161,3 -60,primals_61,placeholder,primals_61,unknown,,0,0,1,0,5138,3 -61,primals_62,placeholder,primals_62,unknown,,0,0,1,0,5142,3 -62,primals_63,placeholder,primals_63,unknown,,0,0,1,0,5218,2 -63,primals_64,placeholder,primals_64,unknown,,0,0,1,0,5165,2 -64,primals_65,placeholder,primals_65,unknown,,0,0,1,0,5105,3 -65,primals_66,placeholder,primals_66,unknown,,0,0,1,0,5105,3 -66,primals_67,placeholder,primals_67,unknown,,0,0,1,0,5098,3 -67,primals_68,placeholder,primals_68,unknown,,0,0,1,0,5085,3 -68,primals_69,placeholder,primals_69,unknown,,0,0,1,0,5065,3 -69,primals_70,placeholder,primals_70,unknown,,0,0,1,0,5042,3 -70,primals_71,placeholder,primals_71,unknown,,0,0,1,0,5046,3 -71,primals_72,placeholder,primals_72,unknown,,0,0,1,0,5122,2 -72,primals_73,placeholder,primals_73,unknown,,0,0,1,0,5069,2 -73,primals_74,placeholder,primals_74,unknown,,0,0,1,0,5009,3 -74,primals_75,placeholder,primals_75,unknown,,0,0,1,0,5009,3 -75,primals_76,placeholder,primals_76,unknown,,0,0,1,0,5002,3 -76,primals_77,placeholder,primals_77,unknown,,0,0,1,0,4989,3 -77,primals_78,placeholder,primals_78,unknown,,0,0,1,0,4969,3 -78,primals_79,placeholder,primals_79,unknown,,0,0,1,0,4946,3 -79,primals_80,placeholder,primals_80,unknown,,0,0,1,0,4950,3 -80,primals_81,placeholder,primals_81,unknown,,0,0,1,0,5026,2 -81,primals_82,placeholder,primals_82,unknown,,0,0,1,0,4973,2 -82,primals_83,placeholder,primals_83,unknown,,0,0,1,0,4913,3 -83,primals_84,placeholder,primals_84,unknown,,0,0,1,0,4913,3 -84,primals_85,placeholder,primals_85,unknown,,0,0,1,0,4906,3 -85,primals_86,placeholder,primals_86,unknown,,0,0,1,0,4893,3 -86,primals_87,placeholder,primals_87,unknown,,0,0,1,0,4873,3 -87,primals_88,placeholder,primals_88,unknown,,0,0,1,0,4850,3 
-88,primals_89,placeholder,primals_89,unknown,,0,0,1,0,4854,3 -89,primals_90,placeholder,primals_90,unknown,,0,0,1,0,4930,2 -90,primals_91,placeholder,primals_91,unknown,,0,0,1,0,4877,2 -91,primals_92,placeholder,primals_92,unknown,,0,0,1,0,4817,3 -92,primals_93,placeholder,primals_93,unknown,,0,0,1,0,4817,3 -93,primals_94,placeholder,primals_94,unknown,,0,0,1,0,4810,3 -94,primals_95,placeholder,primals_95,unknown,,0,0,1,0,4797,3 -95,primals_96,placeholder,primals_96,unknown,,0,0,1,0,4777,3 -96,primals_97,placeholder,primals_97,unknown,,0,0,1,0,4754,3 -97,primals_98,placeholder,primals_98,unknown,,0,0,1,0,4758,3 -98,primals_99,placeholder,primals_99,unknown,,0,0,1,0,4834,2 -99,primals_100,placeholder,primals_100,unknown,,0,0,1,0,4781,2 -100,primals_101,placeholder,primals_101,unknown,,0,0,1,0,4721,3 -101,primals_102,placeholder,primals_102,unknown,,0,0,1,0,4721,3 -102,primals_103,placeholder,primals_103,unknown,,0,0,1,0,4714,3 -103,primals_104,placeholder,primals_104,unknown,,0,0,1,0,4701,3 -104,primals_105,placeholder,primals_105,unknown,,0,0,1,0,4681,3 -105,primals_106,placeholder,primals_106,unknown,,0,0,1,0,4658,3 -106,primals_107,placeholder,primals_107,unknown,,0,0,1,0,4662,3 -107,primals_108,placeholder,primals_108,unknown,,0,0,1,0,4738,2 -108,primals_109,placeholder,primals_109,unknown,,0,0,1,0,4685,2 -109,primals_110,placeholder,primals_110,unknown,,0,0,1,0,4625,3 -110,primals_111,placeholder,primals_111,unknown,,0,0,1,0,4625,3 -111,primals_112,placeholder,primals_112,unknown,,0,0,1,0,4618,3 -112,primals_113,placeholder,primals_113,unknown,,0,0,1,0,4605,3 -113,primals_114,placeholder,primals_114,unknown,,0,0,1,0,4585,3 -114,primals_115,placeholder,primals_115,unknown,,0,0,1,0,4562,3 -115,primals_116,placeholder,primals_116,unknown,,0,0,1,0,4566,3 -116,primals_117,placeholder,primals_117,unknown,,0,0,1,0,4642,2 -117,primals_118,placeholder,primals_118,unknown,,0,0,1,0,4589,2 -118,primals_119,placeholder,primals_119,unknown,,0,0,1,0,4529,3 
-119,primals_120,placeholder,primals_120,unknown,,0,0,1,0,4529,3 -120,primals_121,placeholder,primals_121,unknown,,0,0,1,0,4522,3 -121,primals_122,placeholder,primals_122,unknown,,0,0,1,0,4509,3 -122,primals_123,placeholder,primals_123,unknown,,0,0,1,0,4489,3 -123,primals_124,placeholder,primals_124,unknown,,0,0,1,0,4466,3 -124,primals_125,placeholder,primals_125,unknown,,0,0,1,0,4470,3 -125,primals_126,placeholder,primals_126,unknown,,0,0,1,0,4546,2 -126,primals_127,placeholder,primals_127,unknown,,0,0,1,0,4493,2 -127,primals_128,placeholder,primals_128,unknown,,0,0,1,0,4433,3 -128,primals_129,placeholder,primals_129,unknown,,0,0,1,0,4433,3 -129,primals_130,placeholder,primals_130,unknown,,0,0,1,0,4426,3 -130,primals_131,placeholder,primals_131,unknown,,0,0,1,0,4413,3 -131,primals_132,placeholder,primals_132,unknown,,0,0,1,0,4393,3 -132,primals_133,placeholder,primals_133,unknown,,0,0,1,0,4370,3 -133,primals_134,placeholder,primals_134,unknown,,0,0,1,0,4374,3 -134,primals_135,placeholder,primals_135,unknown,,0,0,1,0,4450,2 -135,primals_136,placeholder,primals_136,unknown,,0,0,1,0,4397,2 -136,primals_137,placeholder,primals_137,unknown,,0,0,1,0,4337,3 -137,primals_138,placeholder,primals_138,unknown,,0,0,1,0,4337,3 -138,primals_139,placeholder,primals_139,unknown,,0,0,1,0,4330,3 -139,primals_140,placeholder,primals_140,unknown,,0,0,1,0,4317,3 -140,primals_141,placeholder,primals_141,unknown,,0,0,1,0,4297,3 -141,primals_142,placeholder,primals_142,unknown,,0,0,1,0,4274,3 -142,primals_143,placeholder,primals_143,unknown,,0,0,1,0,4278,3 -143,primals_144,placeholder,primals_144,unknown,,0,0,1,0,4354,2 -144,primals_145,placeholder,primals_145,unknown,,0,0,1,0,4301,2 -145,primals_146,placeholder,primals_146,unknown,,0,0,1,0,4241,3 -146,primals_147,placeholder,primals_147,unknown,,0,0,1,0,4241,3 -147,primals_148,placeholder,primals_148,unknown,,0,0,1,0,4234,3 -148,primals_149,placeholder,primals_149,unknown,,0,0,1,0,4221,3 
-149,primals_150,placeholder,primals_150,unknown,,0,0,1,0,4201,3 -150,primals_151,placeholder,primals_151,unknown,,0,0,1,0,4178,3 -151,primals_152,placeholder,primals_152,unknown,,0,0,1,0,4182,3 -152,primals_153,placeholder,primals_153,unknown,,0,0,1,0,4258,2 -153,primals_154,placeholder,primals_154,unknown,,0,0,1,0,4205,2 -154,primals_155,placeholder,primals_155,unknown,,0,0,1,0,4145,3 -155,primals_156,placeholder,primals_156,unknown,,0,0,1,0,4145,3 -156,primals_157,placeholder,primals_157,unknown,,0,0,1,0,4138,3 -157,primals_158,placeholder,primals_158,unknown,,0,0,1,0,4125,3 -158,primals_159,placeholder,primals_159,unknown,,0,0,1,0,4105,3 -159,primals_160,placeholder,primals_160,unknown,,0,0,1,0,4082,3 -160,primals_161,placeholder,primals_161,unknown,,0,0,1,0,4086,3 -161,primals_162,placeholder,primals_162,unknown,,0,0,1,0,4162,2 -162,primals_163,placeholder,primals_163,unknown,,0,0,1,0,4109,2 -163,primals_164,placeholder,primals_164,unknown,,0,0,1,0,4049,3 -164,primals_165,placeholder,primals_165,unknown,,0,0,1,0,4049,3 -165,primals_166,placeholder,primals_166,unknown,,0,0,1,0,4042,3 -166,primals_167,placeholder,primals_167,unknown,,0,0,1,0,4029,3 -167,primals_168,placeholder,primals_168,unknown,,0,0,1,0,4009,3 -168,primals_169,placeholder,primals_169,unknown,,0,0,1,0,3986,3 -169,primals_170,placeholder,primals_170,unknown,,0,0,1,0,3990,3 -170,primals_171,placeholder,primals_171,unknown,,0,0,1,0,4066,2 -171,primals_172,placeholder,primals_172,unknown,,0,0,1,0,4013,2 -172,primals_173,placeholder,primals_173,unknown,,0,0,1,0,3953,3 -173,primals_174,placeholder,primals_174,unknown,,0,0,1,0,3953,3 -174,primals_175,placeholder,primals_175,unknown,,0,0,1,0,3946,3 -175,primals_176,placeholder,primals_176,unknown,,0,0,1,0,3933,3 -176,primals_177,placeholder,primals_177,unknown,,0,0,1,0,3913,3 -177,primals_178,placeholder,primals_178,unknown,,0,0,1,0,3890,3 -178,primals_179,placeholder,primals_179,unknown,,0,0,1,0,3894,3 
-179,primals_180,placeholder,primals_180,unknown,,0,0,1,0,3970,2 -180,primals_181,placeholder,primals_181,unknown,,0,0,1,0,3917,2 -181,primals_182,placeholder,primals_182,unknown,,0,0,1,0,3857,3 -182,primals_183,placeholder,primals_183,unknown,,0,0,1,0,3857,3 -183,primals_184,placeholder,primals_184,unknown,,0,0,1,0,3850,3 -184,primals_185,placeholder,primals_185,unknown,,0,0,1,0,3837,3 -185,primals_186,placeholder,primals_186,unknown,,0,0,1,0,3817,3 -186,primals_187,placeholder,primals_187,unknown,,0,0,1,0,3794,3 -187,primals_188,placeholder,primals_188,unknown,,0,0,1,0,3798,3 -188,primals_189,placeholder,primals_189,unknown,,0,0,1,0,3874,2 -189,primals_190,placeholder,primals_190,unknown,,0,0,1,0,3821,2 -190,primals_191,placeholder,primals_191,unknown,,0,0,1,0,3761,3 -191,primals_192,placeholder,primals_192,unknown,,0,0,1,0,3761,3 -192,primals_193,placeholder,primals_193,unknown,,0,0,1,0,3754,3 -193,primals_194,placeholder,primals_194,unknown,,0,0,1,0,3741,3 -194,primals_195,placeholder,primals_195,unknown,,0,0,1,0,3721,3 -195,primals_196,placeholder,primals_196,unknown,,0,0,1,0,3698,3 -196,primals_197,placeholder,primals_197,unknown,,0,0,1,0,3702,3 -197,primals_198,placeholder,primals_198,unknown,,0,0,1,0,3778,2 -198,primals_199,placeholder,primals_199,unknown,,0,0,1,0,3725,2 -199,primals_200,placeholder,primals_200,unknown,,0,0,1,0,3665,3 -200,primals_201,placeholder,primals_201,unknown,,0,0,1,0,3665,3 -201,primals_202,placeholder,primals_202,unknown,,0,0,1,0,3658,3 -202,primals_203,placeholder,primals_203,unknown,,0,0,1,0,3645,3 -203,primals_204,placeholder,primals_204,unknown,,0,0,1,0,3625,3 -204,primals_205,placeholder,primals_205,unknown,,0,0,1,0,3602,3 -205,primals_206,placeholder,primals_206,unknown,,0,0,1,0,3606,3 -206,primals_207,placeholder,primals_207,unknown,,0,0,1,0,3682,2 -207,primals_208,placeholder,primals_208,unknown,,0,0,1,0,3629,2 -208,primals_209,placeholder,primals_209,unknown,,0,0,1,0,3569,3 
-209,primals_210,placeholder,primals_210,unknown,,0,0,1,0,3569,3 -210,primals_211,placeholder,primals_211,unknown,,0,0,1,0,3562,3 -211,primals_212,placeholder,primals_212,unknown,,0,0,1,0,3549,3 -212,primals_213,placeholder,primals_213,unknown,,0,0,1,0,3529,3 -213,primals_214,placeholder,primals_214,unknown,,0,0,1,0,3506,3 -214,primals_215,placeholder,primals_215,unknown,,0,0,1,0,3510,3 -215,primals_216,placeholder,primals_216,unknown,,0,0,1,0,3586,2 -216,primals_217,placeholder,primals_217,unknown,,0,0,1,0,3533,2 -217,primals_218,placeholder,primals_218,unknown,,0,0,1,0,3473,3 -218,primals_219,placeholder,primals_219,unknown,,0,0,1,0,3473,3 -219,primals_220,placeholder,primals_220,unknown,,0,0,1,0,3466,3 -220,primals_221,placeholder,primals_221,unknown,,0,0,1,0,3453,3 -221,primals_222,placeholder,primals_222,unknown,,0,0,1,0,3433,3 -222,primals_223,placeholder,primals_223,unknown,,0,0,1,0,3410,3 -223,primals_224,placeholder,primals_224,unknown,,0,0,1,0,3414,3 -224,primals_225,placeholder,primals_225,unknown,,0,0,1,0,3490,2 -225,primals_226,placeholder,primals_226,unknown,,0,0,1,0,3437,2 -226,primals_227,placeholder,primals_227,unknown,,0,0,1,0,3377,3 -227,primals_228,placeholder,primals_228,unknown,,0,0,1,0,3377,3 -228,primals_229,placeholder,primals_229,unknown,,0,0,1,0,3370,3 -229,primals_230,placeholder,primals_230,unknown,,0,0,1,0,3357,3 -230,primals_231,placeholder,primals_231,unknown,,0,0,1,0,3337,3 -231,primals_232,placeholder,primals_232,unknown,,0,0,1,0,3314,3 -232,primals_233,placeholder,primals_233,unknown,,0,0,1,0,3318,3 -233,primals_234,placeholder,primals_234,unknown,,0,0,1,0,3394,2 -234,primals_235,placeholder,primals_235,unknown,,0,0,1,0,3341,2 -235,primals_236,placeholder,primals_236,unknown,,0,0,1,0,3281,3 -236,primals_237,placeholder,primals_237,unknown,,0,0,1,0,3281,3 -237,primals_238,placeholder,primals_238,unknown,,0,0,1,0,3274,3 -238,primals_239,placeholder,primals_239,unknown,,0,0,1,0,3261,3 
-239,primals_240,placeholder,primals_240,unknown,,0,0,1,0,3241,3 -240,primals_241,placeholder,primals_241,unknown,,0,0,1,0,3218,3 -241,primals_242,placeholder,primals_242,unknown,,0,0,1,0,3222,3 -242,primals_243,placeholder,primals_243,unknown,,0,0,1,0,3298,2 -243,primals_244,placeholder,primals_244,unknown,,0,0,1,0,3245,2 -244,primals_245,placeholder,primals_245,unknown,,0,0,1,0,3185,3 -245,primals_246,placeholder,primals_246,unknown,,0,0,1,0,3185,3 -246,primals_247,placeholder,primals_247,unknown,,0,0,1,0,3178,3 -247,primals_248,placeholder,primals_248,unknown,,0,0,1,0,3165,3 -248,primals_249,placeholder,primals_249,unknown,,0,0,1,0,3145,3 -249,primals_250,placeholder,primals_250,unknown,,0,0,1,0,3122,3 -250,primals_251,placeholder,primals_251,unknown,,0,0,1,0,3126,3 -251,primals_252,placeholder,primals_252,unknown,,0,0,1,0,3202,2 -252,primals_253,placeholder,primals_253,unknown,,0,0,1,0,3149,2 -253,primals_254,placeholder,primals_254,unknown,,0,0,1,0,3103,2 -254,primals_255,placeholder,primals_255,unknown,,0,0,1,0,5943,3 -255,primals_256,placeholder,primals_256,unknown,,0,0,1,0,5806,3 -256,tangents_1,placeholder,tangents_1,backward,,0,0,1,0,3104,4 -257,alias_default,call_function,alias.default,unknown,,1,1,2,1,5815,3 -258,dtype_cast,call_function,dtype_cast.default,forward,,1,1,1,2,5805,3 -259,alias_default_2,call_function,alias.default,unknown,,1,1,2,1,5805,3 -260,embedding,call_function,embedding.default,forward,,2,2,1,5,5804,5 -261,dtype_cast_1,call_function,dtype_cast.default,forward,0,1,1,1,1,5793,2 -262,alias_default_4,call_function,alias.default,forward,,1,1,3,6,5803,4 -263,convert_element_type,call_function,convert_element_type.default,forward,0,1,1,1,7,5801,4 -264,alias_default_6,call_function,alias.default,forward,0,1,1,2,8,5800,4 -265,pow_1,call_function,pow.Tensor_Scalar,forward,0,1,1,1,9,5799,4 -266,mean,call_function,mean.dim,forward,0,1,1,1,10,5798,4 -267,add,call_function,add.Scalar,forward,0,1,1,1,11,5797,3 
-268,rsqrt,call_function,rsqrt.default,forward,0,1,1,1,12,5796,3 -269,alias_default_7,call_function,alias.default,forward,0,1,1,3,13,5795,3 -270,mul,call_function,mul.Tensor,forward,0,2,2,1,14,5791,8 -271,alias_default_5,call_function,alias.default,forward,0,1,1,2,2,5792,2 -272,mul_1,call_function,mul.Tensor,forward,0,2,2,1,18,5790,8 -273,convert_element_type_1,call_function,convert_element_type.default,forward,0,1,1,1,19,5789,6 -274,dtype_cast_2,call_function,dtype_cast.default,forward,0,1,1,1,1,5776,3 -275,permute,call_function,permute.default,forward,0,1,1,1,2,5775,3 -276,alias_default_8,call_function,alias.default,forward,0,1,1,6,20,5788,4 -277,alias_default_9,call_function,alias.default,forward,0,1,1,2,3,5774,3 -278,einsum_default,call_function,einsum.default,forward,0,2,2,1,25,5772,5 -279,dtype_cast_3,call_function,dtype_cast.default,forward,0,1,1,1,1,5776,3 -280,permute_1,call_function,permute.default,forward,0,1,1,1,2,5775,3 -281,alias_default_10,call_function,alias.default,forward,0,1,1,2,3,5774,3 -282,einsum_default_1,call_function,einsum.default,forward,0,2,2,1,25,5772,5 -283,dtype_cast_4,call_function,dtype_cast.default,forward,0,1,1,1,1,5769,3 -284,permute_2,call_function,permute.default,forward,0,1,1,1,2,5768,3 -285,alias_default_11,call_function,alias.default,forward,0,1,1,2,3,5767,3 -286,einsum_default_2,call_function,einsum.default,forward,0,2,2,1,25,5765,5 -287,view_6,call_function,view.default,forward,0,1,1,1,26,5771,4 -288,view_7,call_function,view.default,forward,0,1,1,1,26,5771,4 -289,view_8,call_function,view.default,forward,0,1,1,1,26,5764,4 -290,convert_element_type_8,call_function,convert_element_type.default,forward,0,1,1,1,27,5770,4 -291,view_9,call_function,view.default,forward,0,1,1,1,28,5769,4 -292,view_as_complex,call_function,view_as_complex.default,forward,0,1,1,1,29,5768,6 -293,convert_element_type_9,call_function,convert_element_type.default,forward,0,1,1,1,27,5770,4 
-294,view_10,call_function,view.default,forward,0,1,1,1,28,5769,4 -295,view_as_complex_1,call_function,view_as_complex.default,forward,0,1,1,1,29,5768,6 -296,alias_default_1,call_function,alias.default,unknown,,1,1,28,1,5942,3 -297,view_11,call_function,view.default,forward,0,1,1,1,2,5779,3 -298,alias_default_12,call_function,alias.default,forward,0,1,1,4,3,5778,3 -299,mul_2,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8 -300,view_as_real,call_function,view_as_real.default,forward,0,1,1,1,35,5766,6 -301,view_12,call_function,view.default,forward,0,1,1,1,36,5765,6 -302,mul_3,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8 -303,view_as_real_1,call_function,view_as_real.default,forward,0,1,1,1,35,5766,6 -304,view_13,call_function,view.default,forward,0,1,1,1,36,5765,6 -305,convert_element_type_10,call_function,convert_element_type.default,forward,0,1,1,1,37,5764,6 -306,convert_element_type_11,call_function,convert_element_type.default,forward,0,1,1,1,37,5764,6 -307,permute_3,call_function,permute.default,forward,0,1,1,1,38,5763,6 -308,permute_4,call_function,permute.default,forward,0,1,1,1,38,5763,6 -309,permute_5,call_function,permute.default,forward,0,1,1,1,27,5763,4 -310,alias_default_13,call_function,alias.default,forward,0,1,1,2,39,5762,4 -311,alias_default_14,call_function,alias.default,forward,0,1,1,2,39,5762,4 -312,alias_default_15,call_function,alias.default,forward,0,1,1,2,28,5762,4 -313,_scaled_dot_product_flash_attention,call_function,_scaled_dot_product_flash_attention.default,forward,0,3,3,4,63,5761,2 -314,getitem,call_function,getitem,forward,0,1,1,1,64,5757,2 -315,getitem_1,call_function,getitem,forward,0,1,1,1,64,64,2 -316,getitem_6,call_function,getitem,forward,0,1,1,1,64,64,1 -317,getitem_7,call_function,getitem,forward,0,1,1,1,64,64,1 -318,alias_default_16,call_function,alias.default,forward,0,1,1,2,65,5756,4 -319,permute_6,call_function,permute.default,forward,0,1,1,1,66,5755,4 
-320,view_14,call_function,view.default,forward,0,1,1,1,67,5754,3 -321,dtype_cast_5,call_function,dtype_cast.default,forward,0,1,1,1,1,5756,3 -322,permute_7,call_function,permute.default,forward,0,1,1,1,2,5755,3 -323,alias_default_17,call_function,alias.default,forward,0,1,1,2,68,5753,4 -324,alias_default_18,call_function,alias.default,forward,0,1,1,2,3,5754,3 -325,einsum_default_3,call_function,einsum.default,forward,0,2,2,1,73,5752,5 -326,add_1,call_function,add.Tensor,forward,0,2,2,1,74,5751,10 -327,dtype_cast_6,call_function,dtype_cast.default,forward,0,1,1,1,1,5740,2 -328,alias_default_19,call_function,alias.default,forward,0,1,1,3,75,5750,4 -329,convert_element_type_14,call_function,convert_element_type.default,forward,0,1,1,1,76,5748,4 -330,alias_default_21,call_function,alias.default,forward,0,1,1,2,77,5747,4 -331,pow_2,call_function,pow.Tensor_Scalar,forward,0,1,1,1,78,5746,4 -332,mean_1,call_function,mean.dim,forward,0,1,1,1,79,5745,4 -333,add_2,call_function,add.Scalar,forward,0,1,1,1,80,5744,3 -334,rsqrt_1,call_function,rsqrt.default,forward,0,1,1,1,81,5743,3 -335,alias_default_22,call_function,alias.default,forward,0,1,1,3,82,5742,3 -336,mul_4,call_function,mul.Tensor,forward,0,2,2,1,83,5738,8 -337,alias_default_20,call_function,alias.default,forward,0,1,1,2,2,5739,2 -338,mul_5,call_function,mul.Tensor,forward,0,2,2,1,87,5737,8 -339,convert_element_type_15,call_function,convert_element_type.default,forward,0,1,1,1,88,5736,6 -340,dtype_cast_7,call_function,dtype_cast.default,forward,0,1,1,1,1,5736,3 -341,permute_8,call_function,permute.default,forward,0,1,1,1,2,5735,3 -342,alias_default_23,call_function,alias.default,forward,0,1,1,4,89,5735,4 -343,alias_default_24,call_function,alias.default,forward,0,1,1,2,3,5734,3 -344,einsum_default_4,call_function,einsum.default,forward,0,2,2,1,94,5732,5 -345,alias_default_25,call_function,alias.default,forward,0,1,1,2,95,5731,4 
-346,convert_element_type_18,call_function,convert_element_type.default,forward,0,1,1,1,96,5719,4 -347,alias_default_26,call_function,alias.default,forward,0,1,1,2,97,5718,4 -348,neg,call_function,neg.default,forward,0,1,1,1,98,5717,8 -349,exp,call_function,exp.default,forward,0,1,1,1,99,5716,6 -350,add_3,call_function,add.Tensor,forward,0,1,1,1,100,5715,4 -351,div,call_function,div.Tensor,forward,0,2,2,1,101,5714,6 -352,convert_element_type_19,call_function,convert_element_type.default,forward,0,1,1,1,102,5713,6 -353,dtype_cast_8,call_function,dtype_cast.default,forward,0,1,1,1,1,5717,3 -354,permute_9,call_function,permute.default,forward,0,1,1,1,2,5716,3 -355,alias_default_28,call_function,alias.default,forward,0,1,1,2,3,5715,3 -356,einsum_default_5,call_function,einsum.default,forward,0,2,2,1,94,5713,5 -357,alias_default_27,call_function,alias.default,forward,0,1,1,2,103,5712,4 -358,alias_default_29,call_function,alias.default,forward,0,1,1,2,95,5712,4 -359,mul_6,call_function,mul.Tensor,forward,0,2,2,1,110,5711,8 -360,dtype_cast_9,call_function,dtype_cast.default,forward,0,1,1,1,1,5713,3 -361,permute_10,call_function,permute.default,forward,0,1,1,1,2,5712,3 -362,alias_default_30,call_function,alias.default,forward,0,1,1,2,111,5710,4 -363,alias_default_31,call_function,alias.default,forward,0,1,1,2,3,5711,3 -364,einsum_default_6,call_function,einsum.default,forward,0,2,2,1,116,5709,5 -365,add_4,call_function,add.Tensor,forward,0,2,2,1,117,5708,10 -366,dtype_cast_10,call_function,dtype_cast.default,forward,1,1,1,1,1,5697,2 -367,alias_default_32,call_function,alias.default,forward,0,1,1,3,118,5707,4 -368,convert_element_type_24,call_function,convert_element_type.default,forward,1,1,1,1,119,5705,4 -369,alias_default_34,call_function,alias.default,forward,1,1,1,2,120,5704,4 -370,pow_3,call_function,pow.Tensor_Scalar,forward,1,1,1,1,121,5703,4 -371,mean_2,call_function,mean.dim,forward,1,1,1,1,122,5702,4 -372,add_5,call_function,add.Scalar,forward,1,1,1,1,123,5701,3 
-373,rsqrt_2,call_function,rsqrt.default,forward,1,1,1,1,124,5700,3 -374,alias_default_35,call_function,alias.default,forward,1,1,1,3,125,5699,3 -375,mul_7,call_function,mul.Tensor,forward,1,2,2,1,126,5695,8 -376,alias_default_33,call_function,alias.default,forward,1,1,1,2,2,5696,2 -377,mul_8,call_function,mul.Tensor,forward,1,2,2,1,130,5694,8 -378,convert_element_type_25,call_function,convert_element_type.default,forward,1,1,1,1,131,5693,6 -379,dtype_cast_11,call_function,dtype_cast.default,forward,1,1,1,1,1,5680,3 -380,permute_11,call_function,permute.default,forward,1,1,1,1,2,5679,3 -381,alias_default_36,call_function,alias.default,forward,1,1,1,6,132,5692,4 -382,alias_default_37,call_function,alias.default,forward,1,1,1,2,3,5678,3 -383,einsum_default_7,call_function,einsum.default,forward,1,2,2,1,137,5676,5 -384,dtype_cast_12,call_function,dtype_cast.default,forward,1,1,1,1,1,5680,3 -385,permute_12,call_function,permute.default,forward,1,1,1,1,2,5679,3 -386,alias_default_38,call_function,alias.default,forward,1,1,1,2,3,5678,3 -387,einsum_default_8,call_function,einsum.default,forward,1,2,2,1,137,5676,5 -388,dtype_cast_13,call_function,dtype_cast.default,forward,1,1,1,1,1,5673,3 -389,permute_13,call_function,permute.default,forward,1,1,1,1,2,5672,3 -390,alias_default_39,call_function,alias.default,forward,1,1,1,2,3,5671,3 -391,einsum_default_9,call_function,einsum.default,forward,1,2,2,1,137,5669,5 -392,view_29,call_function,view.default,forward,1,1,1,1,138,5675,4 -393,view_30,call_function,view.default,forward,1,1,1,1,138,5675,4 -394,view_31,call_function,view.default,forward,1,1,1,1,138,5668,4 -395,convert_element_type_32,call_function,convert_element_type.default,forward,1,1,1,1,139,5674,4 -396,view_32,call_function,view.default,forward,1,1,1,1,140,5673,4 -397,view_as_complex_2,call_function,view_as_complex.default,forward,1,1,1,1,141,5672,6 -398,convert_element_type_33,call_function,convert_element_type.default,forward,1,1,1,1,139,5674,4 
-399,view_33,call_function,view.default,forward,1,1,1,1,140,5673,4 -400,view_as_complex_3,call_function,view_as_complex.default,forward,1,1,1,1,141,5672,6 -401,view_34,call_function,view.default,forward,1,1,1,1,2,5683,3 -402,alias_default_40,call_function,alias.default,forward,1,1,1,4,3,5682,3 -403,mul_9,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8 -404,view_as_real_2,call_function,view_as_real.default,forward,1,1,1,1,145,5670,6 -405,view_35,call_function,view.default,forward,1,1,1,1,146,5669,6 -406,mul_10,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8 -407,view_as_real_3,call_function,view_as_real.default,forward,1,1,1,1,145,5670,6 -408,view_36,call_function,view.default,forward,1,1,1,1,146,5669,6 -409,convert_element_type_34,call_function,convert_element_type.default,forward,1,1,1,1,147,5668,6 -410,convert_element_type_35,call_function,convert_element_type.default,forward,1,1,1,1,147,5668,6 -411,permute_14,call_function,permute.default,forward,1,1,1,1,148,5667,6 -412,permute_15,call_function,permute.default,forward,1,1,1,1,148,5667,6 -413,permute_16,call_function,permute.default,forward,1,1,1,1,139,5667,4 -414,alias_default_41,call_function,alias.default,forward,1,1,1,2,149,5666,4 -415,alias_default_42,call_function,alias.default,forward,1,1,1,2,149,5666,4 -416,alias_default_43,call_function,alias.default,forward,1,1,1,2,140,5666,4 -417,_scaled_dot_product_flash_attention_1,call_function,_scaled_dot_product_flash_attention.default,forward,1,3,3,4,173,5665,2 -418,getitem_9,call_function,getitem,forward,1,1,1,1,174,5661,2 -419,getitem_10,call_function,getitem,forward,1,1,1,1,174,174,2 -420,getitem_15,call_function,getitem,forward,1,1,1,1,174,174,1 -421,getitem_16,call_function,getitem,forward,1,1,1,1,174,174,1 -422,alias_default_44,call_function,alias.default,forward,1,1,1,2,175,5660,4 -423,permute_17,call_function,permute.default,forward,1,1,1,1,176,5659,4 -424,view_37,call_function,view.default,forward,1,1,1,1,177,5658,3 
-425,dtype_cast_14,call_function,dtype_cast.default,forward,1,1,1,1,1,5660,3 -426,permute_18,call_function,permute.default,forward,1,1,1,1,2,5659,3 -427,alias_default_45,call_function,alias.default,forward,1,1,1,2,178,5657,4 -428,alias_default_46,call_function,alias.default,forward,1,1,1,2,3,5658,3 -429,einsum_default_10,call_function,einsum.default,forward,1,2,2,1,183,5656,5 -430,add_6,call_function,add.Tensor,forward,1,2,2,1,184,5655,10 -431,dtype_cast_15,call_function,dtype_cast.default,forward,1,1,1,1,1,5644,2 -432,alias_default_47,call_function,alias.default,forward,1,1,1,3,185,5654,4 -433,convert_element_type_38,call_function,convert_element_type.default,forward,1,1,1,1,186,5652,4 -434,alias_default_49,call_function,alias.default,forward,1,1,1,2,187,5651,4 -435,pow_4,call_function,pow.Tensor_Scalar,forward,1,1,1,1,188,5650,4 -436,mean_3,call_function,mean.dim,forward,1,1,1,1,189,5649,4 -437,add_7,call_function,add.Scalar,forward,1,1,1,1,190,5648,3 -438,rsqrt_3,call_function,rsqrt.default,forward,1,1,1,1,191,5647,3 -439,alias_default_50,call_function,alias.default,forward,1,1,1,3,192,5646,3 -440,mul_11,call_function,mul.Tensor,forward,1,2,2,1,193,5642,8 -441,alias_default_48,call_function,alias.default,forward,1,1,1,2,2,5643,2 -442,mul_12,call_function,mul.Tensor,forward,1,2,2,1,197,5641,8 -443,convert_element_type_39,call_function,convert_element_type.default,forward,1,1,1,1,198,5640,6 -444,dtype_cast_16,call_function,dtype_cast.default,forward,1,1,1,1,1,5640,3 -445,permute_19,call_function,permute.default,forward,1,1,1,1,2,5639,3 -446,alias_default_51,call_function,alias.default,forward,1,1,1,4,199,5639,4 -447,alias_default_52,call_function,alias.default,forward,1,1,1,2,3,5638,3 -448,einsum_default_11,call_function,einsum.default,forward,1,2,2,1,204,5636,5 -449,alias_default_53,call_function,alias.default,forward,1,1,1,2,205,5635,4 -450,convert_element_type_42,call_function,convert_element_type.default,forward,1,1,1,1,206,5623,4 
-451,alias_default_54,call_function,alias.default,forward,1,1,1,2,207,5622,4 -452,neg_1,call_function,neg.default,forward,1,1,1,1,208,5621,8 -453,exp_1,call_function,exp.default,forward,1,1,1,1,209,5620,6 -454,add_8,call_function,add.Tensor,forward,1,1,1,1,210,5619,4 -455,div_1,call_function,div.Tensor,forward,1,2,2,1,211,5618,6 -456,convert_element_type_43,call_function,convert_element_type.default,forward,1,1,1,1,212,5617,6 -457,dtype_cast_17,call_function,dtype_cast.default,forward,1,1,1,1,1,5621,3 -458,permute_20,call_function,permute.default,forward,1,1,1,1,2,5620,3 -459,alias_default_56,call_function,alias.default,forward,1,1,1,2,3,5619,3 -460,einsum_default_12,call_function,einsum.default,forward,1,2,2,1,204,5617,5 -461,alias_default_55,call_function,alias.default,forward,1,1,1,2,213,5616,4 -462,alias_default_57,call_function,alias.default,forward,1,1,1,2,205,5616,4 -463,mul_13,call_function,mul.Tensor,forward,1,2,2,1,220,5615,8 -464,dtype_cast_18,call_function,dtype_cast.default,forward,1,1,1,1,1,5617,3 -465,permute_21,call_function,permute.default,forward,1,1,1,1,2,5616,3 -466,alias_default_58,call_function,alias.default,forward,1,1,1,2,221,5614,4 -467,alias_default_59,call_function,alias.default,forward,1,1,1,2,3,5615,3 -468,einsum_default_13,call_function,einsum.default,forward,1,2,2,1,226,5613,5 -469,add_9,call_function,add.Tensor,forward,1,2,2,1,227,5612,10 -470,dtype_cast_19,call_function,dtype_cast.default,forward,2,1,1,1,1,5601,2 -471,alias_default_60,call_function,alias.default,forward,1,1,1,3,228,5611,4 -472,convert_element_type_48,call_function,convert_element_type.default,forward,2,1,1,1,229,5609,4 -473,alias_default_62,call_function,alias.default,forward,2,1,1,2,230,5608,4 -474,pow_5,call_function,pow.Tensor_Scalar,forward,2,1,1,1,231,5607,4 -475,mean_4,call_function,mean.dim,forward,2,1,1,1,232,5606,4 -476,add_10,call_function,add.Scalar,forward,2,1,1,1,233,5605,3 -477,rsqrt_4,call_function,rsqrt.default,forward,2,1,1,1,234,5604,3 
-478,alias_default_63,call_function,alias.default,forward,2,1,1,3,235,5603,3 -479,mul_14,call_function,mul.Tensor,forward,2,2,2,1,236,5599,8 -480,alias_default_61,call_function,alias.default,forward,2,1,1,2,2,5600,2 -481,mul_15,call_function,mul.Tensor,forward,2,2,2,1,240,5598,8 -482,convert_element_type_49,call_function,convert_element_type.default,forward,2,1,1,1,241,5597,6 -483,dtype_cast_20,call_function,dtype_cast.default,forward,2,1,1,1,1,5584,3 -484,permute_22,call_function,permute.default,forward,2,1,1,1,2,5583,3 -485,alias_default_64,call_function,alias.default,forward,2,1,1,6,242,5596,4 -486,alias_default_65,call_function,alias.default,forward,2,1,1,2,3,5582,3 -487,einsum_default_14,call_function,einsum.default,forward,2,2,2,1,247,5580,5 -488,dtype_cast_21,call_function,dtype_cast.default,forward,2,1,1,1,1,5584,3 -489,permute_23,call_function,permute.default,forward,2,1,1,1,2,5583,3 -490,alias_default_66,call_function,alias.default,forward,2,1,1,2,3,5582,3 -491,einsum_default_15,call_function,einsum.default,forward,2,2,2,1,247,5580,5 -492,dtype_cast_22,call_function,dtype_cast.default,forward,2,1,1,1,1,5577,3 -493,permute_24,call_function,permute.default,forward,2,1,1,1,2,5576,3 -494,alias_default_67,call_function,alias.default,forward,2,1,1,2,3,5575,3 -495,einsum_default_16,call_function,einsum.default,forward,2,2,2,1,247,5573,5 -496,view_52,call_function,view.default,forward,2,1,1,1,248,5579,4 -497,view_53,call_function,view.default,forward,2,1,1,1,248,5579,4 -498,view_54,call_function,view.default,forward,2,1,1,1,248,5572,4 -499,convert_element_type_56,call_function,convert_element_type.default,forward,2,1,1,1,249,5578,4 -500,view_55,call_function,view.default,forward,2,1,1,1,250,5577,4 -501,view_as_complex_4,call_function,view_as_complex.default,forward,2,1,1,1,251,5576,6 -502,convert_element_type_57,call_function,convert_element_type.default,forward,2,1,1,1,249,5578,4 -503,view_56,call_function,view.default,forward,2,1,1,1,250,5577,4 
-504,view_as_complex_5,call_function,view_as_complex.default,forward,2,1,1,1,251,5576,6 -505,view_57,call_function,view.default,forward,2,1,1,1,2,5587,3 -506,alias_default_68,call_function,alias.default,forward,2,1,1,4,3,5586,3 -507,mul_16,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8 -508,view_as_real_4,call_function,view_as_real.default,forward,2,1,1,1,255,5574,6 -509,view_58,call_function,view.default,forward,2,1,1,1,256,5573,6 -510,mul_17,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8 -511,view_as_real_5,call_function,view_as_real.default,forward,2,1,1,1,255,5574,6 -512,view_59,call_function,view.default,forward,2,1,1,1,256,5573,6 -513,convert_element_type_58,call_function,convert_element_type.default,forward,2,1,1,1,257,5572,6 -514,convert_element_type_59,call_function,convert_element_type.default,forward,2,1,1,1,257,5572,6 -515,permute_25,call_function,permute.default,forward,2,1,1,1,258,5571,6 -516,permute_26,call_function,permute.default,forward,2,1,1,1,258,5571,6 -517,permute_27,call_function,permute.default,forward,2,1,1,1,249,5571,4 -518,alias_default_69,call_function,alias.default,forward,2,1,1,2,259,5570,4 -519,alias_default_70,call_function,alias.default,forward,2,1,1,2,259,5570,4 -520,alias_default_71,call_function,alias.default,forward,2,1,1,2,250,5570,4 -521,_scaled_dot_product_flash_attention_2,call_function,_scaled_dot_product_flash_attention.default,forward,2,3,3,4,283,5569,2 -522,getitem_18,call_function,getitem,forward,2,1,1,1,284,5565,2 -523,getitem_19,call_function,getitem,forward,2,1,1,1,284,284,2 -524,getitem_24,call_function,getitem,forward,2,1,1,1,284,284,1 -525,getitem_25,call_function,getitem,forward,2,1,1,1,284,284,1 -526,alias_default_72,call_function,alias.default,forward,2,1,1,2,285,5564,4 -527,permute_28,call_function,permute.default,forward,2,1,1,1,286,5563,4 -528,view_60,call_function,view.default,forward,2,1,1,1,287,5562,3 -529,dtype_cast_23,call_function,dtype_cast.default,forward,2,1,1,1,1,5564,3 
-530,permute_29,call_function,permute.default,forward,2,1,1,1,2,5563,3 -531,alias_default_73,call_function,alias.default,forward,2,1,1,2,288,5561,4 -532,alias_default_74,call_function,alias.default,forward,2,1,1,2,3,5562,3 -533,einsum_default_17,call_function,einsum.default,forward,2,2,2,1,293,5560,5 -534,add_11,call_function,add.Tensor,forward,2,2,2,1,294,5559,10 -535,dtype_cast_24,call_function,dtype_cast.default,forward,2,1,1,1,1,5548,2 -536,alias_default_75,call_function,alias.default,forward,2,1,1,3,295,5558,4 -537,convert_element_type_62,call_function,convert_element_type.default,forward,2,1,1,1,296,5556,4 -538,alias_default_77,call_function,alias.default,forward,2,1,1,2,297,5555,4 -539,pow_6,call_function,pow.Tensor_Scalar,forward,2,1,1,1,298,5554,4 -540,mean_5,call_function,mean.dim,forward,2,1,1,1,299,5553,4 -541,add_12,call_function,add.Scalar,forward,2,1,1,1,300,5552,3 -542,rsqrt_5,call_function,rsqrt.default,forward,2,1,1,1,301,5551,3 -543,alias_default_78,call_function,alias.default,forward,2,1,1,3,302,5550,3 -544,mul_18,call_function,mul.Tensor,forward,2,2,2,1,303,5546,8 -545,alias_default_76,call_function,alias.default,forward,2,1,1,2,2,5547,2 -546,mul_19,call_function,mul.Tensor,forward,2,2,2,1,307,5545,8 -547,convert_element_type_63,call_function,convert_element_type.default,forward,2,1,1,1,308,5544,6 -548,dtype_cast_25,call_function,dtype_cast.default,forward,2,1,1,1,1,5544,3 -549,permute_30,call_function,permute.default,forward,2,1,1,1,2,5543,3 -550,alias_default_79,call_function,alias.default,forward,2,1,1,4,309,5543,4 -551,alias_default_80,call_function,alias.default,forward,2,1,1,2,3,5542,3 -552,einsum_default_18,call_function,einsum.default,forward,2,2,2,1,314,5540,5 -553,alias_default_81,call_function,alias.default,forward,2,1,1,2,315,5539,4 -554,convert_element_type_66,call_function,convert_element_type.default,forward,2,1,1,1,316,5527,4 -555,alias_default_82,call_function,alias.default,forward,2,1,1,2,317,5526,4 
-556,neg_2,call_function,neg.default,forward,2,1,1,1,318,5525,8 -557,exp_2,call_function,exp.default,forward,2,1,1,1,319,5524,6 -558,add_13,call_function,add.Tensor,forward,2,1,1,1,320,5523,4 -559,div_2,call_function,div.Tensor,forward,2,2,2,1,321,5522,6 -560,convert_element_type_67,call_function,convert_element_type.default,forward,2,1,1,1,322,5521,6 -561,dtype_cast_26,call_function,dtype_cast.default,forward,2,1,1,1,1,5525,3 -562,permute_31,call_function,permute.default,forward,2,1,1,1,2,5524,3 -563,alias_default_84,call_function,alias.default,forward,2,1,1,2,3,5523,3 -564,einsum_default_19,call_function,einsum.default,forward,2,2,2,1,314,5521,5 -565,alias_default_83,call_function,alias.default,forward,2,1,1,2,323,5520,4 -566,alias_default_85,call_function,alias.default,forward,2,1,1,2,315,5520,4 -567,mul_20,call_function,mul.Tensor,forward,2,2,2,1,330,5519,8 -568,dtype_cast_27,call_function,dtype_cast.default,forward,2,1,1,1,1,5521,3 -569,permute_32,call_function,permute.default,forward,2,1,1,1,2,5520,3 -570,alias_default_86,call_function,alias.default,forward,2,1,1,2,331,5518,4 -571,alias_default_87,call_function,alias.default,forward,2,1,1,2,3,5519,3 -572,einsum_default_20,call_function,einsum.default,forward,2,2,2,1,336,5517,5 -573,add_14,call_function,add.Tensor,forward,2,2,2,1,337,5516,10 -574,dtype_cast_28,call_function,dtype_cast.default,forward,3,1,1,1,1,5505,2 -575,alias_default_88,call_function,alias.default,forward,2,1,1,3,338,5515,4 -576,convert_element_type_72,call_function,convert_element_type.default,forward,3,1,1,1,339,5513,4 -577,alias_default_90,call_function,alias.default,forward,3,1,1,2,340,5512,4 -578,pow_7,call_function,pow.Tensor_Scalar,forward,3,1,1,1,341,5511,4 -579,mean_6,call_function,mean.dim,forward,3,1,1,1,342,5510,4 -580,add_15,call_function,add.Scalar,forward,3,1,1,1,343,5509,3 -581,rsqrt_6,call_function,rsqrt.default,forward,3,1,1,1,344,5508,3 -582,alias_default_91,call_function,alias.default,forward,3,1,1,3,345,5507,3 
-583,mul_21,call_function,mul.Tensor,forward,3,2,2,1,346,5503,8 -584,alias_default_89,call_function,alias.default,forward,3,1,1,2,2,5504,2 -585,mul_22,call_function,mul.Tensor,forward,3,2,2,1,350,5502,8 -586,convert_element_type_73,call_function,convert_element_type.default,forward,3,1,1,1,351,5501,6 -587,dtype_cast_29,call_function,dtype_cast.default,forward,3,1,1,1,1,5488,3 -588,permute_33,call_function,permute.default,forward,3,1,1,1,2,5487,3 -589,alias_default_92,call_function,alias.default,forward,3,1,1,6,352,5500,4 -590,alias_default_93,call_function,alias.default,forward,3,1,1,2,3,5486,3 -591,einsum_default_21,call_function,einsum.default,forward,3,2,2,1,357,5484,5 -592,dtype_cast_30,call_function,dtype_cast.default,forward,3,1,1,1,1,5488,3 -593,permute_34,call_function,permute.default,forward,3,1,1,1,2,5487,3 -594,alias_default_94,call_function,alias.default,forward,3,1,1,2,3,5486,3 -595,einsum_default_22,call_function,einsum.default,forward,3,2,2,1,357,5484,5 -596,dtype_cast_31,call_function,dtype_cast.default,forward,3,1,1,1,1,5481,3 -597,permute_35,call_function,permute.default,forward,3,1,1,1,2,5480,3 -598,alias_default_95,call_function,alias.default,forward,3,1,1,2,3,5479,3 -599,einsum_default_23,call_function,einsum.default,forward,3,2,2,1,357,5477,5 -600,view_75,call_function,view.default,forward,3,1,1,1,358,5483,4 -601,view_76,call_function,view.default,forward,3,1,1,1,358,5483,4 -602,view_77,call_function,view.default,forward,3,1,1,1,358,5476,4 -603,convert_element_type_80,call_function,convert_element_type.default,forward,3,1,1,1,359,5482,4 -604,view_78,call_function,view.default,forward,3,1,1,1,360,5481,4 -605,view_as_complex_6,call_function,view_as_complex.default,forward,3,1,1,1,361,5480,6 -606,convert_element_type_81,call_function,convert_element_type.default,forward,3,1,1,1,359,5482,4 -607,view_79,call_function,view.default,forward,3,1,1,1,360,5481,4 -608,view_as_complex_7,call_function,view_as_complex.default,forward,3,1,1,1,361,5480,6 
-609,view_80,call_function,view.default,forward,3,1,1,1,2,5491,3 -610,alias_default_96,call_function,alias.default,forward,3,1,1,4,3,5490,3 -611,mul_23,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8 -612,view_as_real_6,call_function,view_as_real.default,forward,3,1,1,1,365,5478,6 -613,view_81,call_function,view.default,forward,3,1,1,1,366,5477,6 -614,mul_24,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8 -615,view_as_real_7,call_function,view_as_real.default,forward,3,1,1,1,365,5478,6 -616,view_82,call_function,view.default,forward,3,1,1,1,366,5477,6 -617,convert_element_type_82,call_function,convert_element_type.default,forward,3,1,1,1,367,5476,6 -618,convert_element_type_83,call_function,convert_element_type.default,forward,3,1,1,1,367,5476,6 -619,permute_36,call_function,permute.default,forward,3,1,1,1,368,5475,6 -620,permute_37,call_function,permute.default,forward,3,1,1,1,368,5475,6 -621,permute_38,call_function,permute.default,forward,3,1,1,1,359,5475,4 -622,alias_default_97,call_function,alias.default,forward,3,1,1,2,369,5474,4 -623,alias_default_98,call_function,alias.default,forward,3,1,1,2,369,5474,4 -624,alias_default_99,call_function,alias.default,forward,3,1,1,2,360,5474,4 -625,_scaled_dot_product_flash_attention_3,call_function,_scaled_dot_product_flash_attention.default,forward,3,3,3,4,393,5473,2 -626,getitem_27,call_function,getitem,forward,3,1,1,1,394,5469,2 -627,getitem_28,call_function,getitem,forward,3,1,1,1,394,394,2 -628,getitem_33,call_function,getitem,forward,3,1,1,1,394,394,1 -629,getitem_34,call_function,getitem,forward,3,1,1,1,394,394,1 -630,alias_default_100,call_function,alias.default,forward,3,1,1,2,395,5468,4 -631,permute_39,call_function,permute.default,forward,3,1,1,1,396,5467,4 -632,view_83,call_function,view.default,forward,3,1,1,1,397,5466,3 -633,dtype_cast_32,call_function,dtype_cast.default,forward,3,1,1,1,1,5468,3 -634,permute_40,call_function,permute.default,forward,3,1,1,1,2,5467,3 
-635,alias_default_101,call_function,alias.default,forward,3,1,1,2,398,5465,4 -636,alias_default_102,call_function,alias.default,forward,3,1,1,2,3,5466,3 -637,einsum_default_24,call_function,einsum.default,forward,3,2,2,1,403,5464,5 -638,add_16,call_function,add.Tensor,forward,3,2,2,1,404,5463,10 -639,dtype_cast_33,call_function,dtype_cast.default,forward,3,1,1,1,1,5452,2 -640,alias_default_103,call_function,alias.default,forward,3,1,1,3,405,5462,4 -641,convert_element_type_86,call_function,convert_element_type.default,forward,3,1,1,1,406,5460,4 -642,alias_default_105,call_function,alias.default,forward,3,1,1,2,407,5459,4 -643,pow_8,call_function,pow.Tensor_Scalar,forward,3,1,1,1,408,5458,4 -644,mean_7,call_function,mean.dim,forward,3,1,1,1,409,5457,4 -645,add_17,call_function,add.Scalar,forward,3,1,1,1,410,5456,3 -646,rsqrt_7,call_function,rsqrt.default,forward,3,1,1,1,411,5455,3 -647,alias_default_106,call_function,alias.default,forward,3,1,1,3,412,5454,3 -648,mul_25,call_function,mul.Tensor,forward,3,2,2,1,413,5450,8 -649,alias_default_104,call_function,alias.default,forward,3,1,1,2,2,5451,2 -650,mul_26,call_function,mul.Tensor,forward,3,2,2,1,417,5449,8 -651,convert_element_type_87,call_function,convert_element_type.default,forward,3,1,1,1,418,5448,6 -652,dtype_cast_34,call_function,dtype_cast.default,forward,3,1,1,1,1,5448,3 -653,permute_41,call_function,permute.default,forward,3,1,1,1,2,5447,3 -654,alias_default_107,call_function,alias.default,forward,3,1,1,4,419,5447,4 -655,alias_default_108,call_function,alias.default,forward,3,1,1,2,3,5446,3 -656,einsum_default_25,call_function,einsum.default,forward,3,2,2,1,424,5444,5 -657,alias_default_109,call_function,alias.default,forward,3,1,1,2,425,5443,4 -658,convert_element_type_90,call_function,convert_element_type.default,forward,3,1,1,1,426,5431,4 -659,alias_default_110,call_function,alias.default,forward,3,1,1,2,427,5430,4 -660,neg_3,call_function,neg.default,forward,3,1,1,1,428,5429,8 
-661,exp_3,call_function,exp.default,forward,3,1,1,1,429,5428,6 -662,add_18,call_function,add.Tensor,forward,3,1,1,1,430,5427,4 -663,div_3,call_function,div.Tensor,forward,3,2,2,1,431,5426,6 -664,convert_element_type_91,call_function,convert_element_type.default,forward,3,1,1,1,432,5425,6 -665,dtype_cast_35,call_function,dtype_cast.default,forward,3,1,1,1,1,5429,3 -666,permute_42,call_function,permute.default,forward,3,1,1,1,2,5428,3 -667,alias_default_112,call_function,alias.default,forward,3,1,1,2,3,5427,3 -668,einsum_default_26,call_function,einsum.default,forward,3,2,2,1,424,5425,5 -669,alias_default_111,call_function,alias.default,forward,3,1,1,2,433,5424,4 -670,alias_default_113,call_function,alias.default,forward,3,1,1,2,425,5424,4 -671,mul_27,call_function,mul.Tensor,forward,3,2,2,1,440,5423,8 -672,dtype_cast_36,call_function,dtype_cast.default,forward,3,1,1,1,1,5425,3 -673,permute_43,call_function,permute.default,forward,3,1,1,1,2,5424,3 -674,alias_default_114,call_function,alias.default,forward,3,1,1,2,441,5422,4 -675,alias_default_115,call_function,alias.default,forward,3,1,1,2,3,5423,3 -676,einsum_default_27,call_function,einsum.default,forward,3,2,2,1,446,5421,5 -677,add_19,call_function,add.Tensor,forward,3,2,2,1,447,5420,10 -678,dtype_cast_37,call_function,dtype_cast.default,forward,4,1,1,1,1,5409,2 -679,alias_default_116,call_function,alias.default,forward,3,1,1,3,448,5419,4 -680,convert_element_type_96,call_function,convert_element_type.default,forward,4,1,1,1,449,5417,4 -681,alias_default_118,call_function,alias.default,forward,4,1,1,2,450,5416,4 -682,pow_9,call_function,pow.Tensor_Scalar,forward,4,1,1,1,451,5415,4 -683,mean_8,call_function,mean.dim,forward,4,1,1,1,452,5414,4 -684,add_20,call_function,add.Scalar,forward,4,1,1,1,453,5413,3 -685,rsqrt_8,call_function,rsqrt.default,forward,4,1,1,1,454,5412,3 -686,alias_default_119,call_function,alias.default,forward,4,1,1,3,455,5411,3 -687,mul_28,call_function,mul.Tensor,forward,4,2,2,1,456,5407,8 
-688,alias_default_117,call_function,alias.default,forward,4,1,1,2,2,5408,2 -689,mul_29,call_function,mul.Tensor,forward,4,2,2,1,460,5406,8 -690,convert_element_type_97,call_function,convert_element_type.default,forward,4,1,1,1,461,5405,6 -691,dtype_cast_38,call_function,dtype_cast.default,forward,4,1,1,1,1,5392,3 -692,permute_44,call_function,permute.default,forward,4,1,1,1,2,5391,3 -693,alias_default_120,call_function,alias.default,forward,4,1,1,6,462,5404,4 -694,alias_default_121,call_function,alias.default,forward,4,1,1,2,3,5390,3 -695,einsum_default_28,call_function,einsum.default,forward,4,2,2,1,467,5388,5 -696,dtype_cast_39,call_function,dtype_cast.default,forward,4,1,1,1,1,5392,3 -697,permute_45,call_function,permute.default,forward,4,1,1,1,2,5391,3 -698,alias_default_122,call_function,alias.default,forward,4,1,1,2,3,5390,3 -699,einsum_default_29,call_function,einsum.default,forward,4,2,2,1,467,5388,5 -700,dtype_cast_40,call_function,dtype_cast.default,forward,4,1,1,1,1,5385,3 -701,permute_46,call_function,permute.default,forward,4,1,1,1,2,5384,3 -702,alias_default_123,call_function,alias.default,forward,4,1,1,2,3,5383,3 -703,einsum_default_30,call_function,einsum.default,forward,4,2,2,1,467,5381,5 -704,view_98,call_function,view.default,forward,4,1,1,1,468,5387,4 -705,view_99,call_function,view.default,forward,4,1,1,1,468,5387,4 -706,view_100,call_function,view.default,forward,4,1,1,1,468,5380,4 -707,convert_element_type_104,call_function,convert_element_type.default,forward,4,1,1,1,469,5386,4 -708,view_101,call_function,view.default,forward,4,1,1,1,470,5385,4 -709,view_as_complex_8,call_function,view_as_complex.default,forward,4,1,1,1,471,5384,6 -710,convert_element_type_105,call_function,convert_element_type.default,forward,4,1,1,1,469,5386,4 -711,view_102,call_function,view.default,forward,4,1,1,1,470,5385,4 -712,view_as_complex_9,call_function,view_as_complex.default,forward,4,1,1,1,471,5384,6 
-713,view_103,call_function,view.default,forward,4,1,1,1,2,5395,3 -714,alias_default_124,call_function,alias.default,forward,4,1,1,4,3,5394,3 -715,mul_30,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8 -716,view_as_real_8,call_function,view_as_real.default,forward,4,1,1,1,475,5382,6 -717,view_104,call_function,view.default,forward,4,1,1,1,476,5381,6 -718,mul_31,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8 -719,view_as_real_9,call_function,view_as_real.default,forward,4,1,1,1,475,5382,6 -720,view_105,call_function,view.default,forward,4,1,1,1,476,5381,6 -721,convert_element_type_106,call_function,convert_element_type.default,forward,4,1,1,1,477,5380,6 -722,convert_element_type_107,call_function,convert_element_type.default,forward,4,1,1,1,477,5380,6 -723,permute_47,call_function,permute.default,forward,4,1,1,1,478,5379,6 -724,permute_48,call_function,permute.default,forward,4,1,1,1,478,5379,6 -725,permute_49,call_function,permute.default,forward,4,1,1,1,469,5379,4 -726,alias_default_125,call_function,alias.default,forward,4,1,1,2,479,5378,4 -727,alias_default_126,call_function,alias.default,forward,4,1,1,2,479,5378,4 -728,alias_default_127,call_function,alias.default,forward,4,1,1,2,470,5378,4 -729,_scaled_dot_product_flash_attention_4,call_function,_scaled_dot_product_flash_attention.default,forward,4,3,3,4,503,5377,2 -730,getitem_36,call_function,getitem,forward,4,1,1,1,504,5373,2 -731,getitem_37,call_function,getitem,forward,4,1,1,1,504,504,2 -732,getitem_42,call_function,getitem,forward,4,1,1,1,504,504,1 -733,getitem_43,call_function,getitem,forward,4,1,1,1,504,504,1 -734,alias_default_128,call_function,alias.default,forward,4,1,1,2,505,5372,4 -735,permute_50,call_function,permute.default,forward,4,1,1,1,506,5371,4 -736,view_106,call_function,view.default,forward,4,1,1,1,507,5370,3 -737,dtype_cast_41,call_function,dtype_cast.default,forward,4,1,1,1,1,5372,3 -738,permute_51,call_function,permute.default,forward,4,1,1,1,2,5371,3 
-739,alias_default_129,call_function,alias.default,forward,4,1,1,2,508,5369,4 -740,alias_default_130,call_function,alias.default,forward,4,1,1,2,3,5370,3 -741,einsum_default_31,call_function,einsum.default,forward,4,2,2,1,513,5368,5 -742,add_21,call_function,add.Tensor,forward,4,2,2,1,514,5367,10 -743,dtype_cast_42,call_function,dtype_cast.default,forward,4,1,1,1,1,5356,2 -744,alias_default_131,call_function,alias.default,forward,4,1,1,3,515,5366,4 -745,convert_element_type_110,call_function,convert_element_type.default,forward,4,1,1,1,516,5364,4 -746,alias_default_133,call_function,alias.default,forward,4,1,1,2,517,5363,4 -747,pow_10,call_function,pow.Tensor_Scalar,forward,4,1,1,1,518,5362,4 -748,mean_9,call_function,mean.dim,forward,4,1,1,1,519,5361,4 -749,add_22,call_function,add.Scalar,forward,4,1,1,1,520,5360,3 -750,rsqrt_9,call_function,rsqrt.default,forward,4,1,1,1,521,5359,3 -751,alias_default_134,call_function,alias.default,forward,4,1,1,3,522,5358,3 -752,mul_32,call_function,mul.Tensor,forward,4,2,2,1,523,5354,8 -753,alias_default_132,call_function,alias.default,forward,4,1,1,2,2,5355,2 -754,mul_33,call_function,mul.Tensor,forward,4,2,2,1,527,5353,8 -755,convert_element_type_111,call_function,convert_element_type.default,forward,4,1,1,1,528,5352,6 -756,dtype_cast_43,call_function,dtype_cast.default,forward,4,1,1,1,1,5352,3 -757,permute_52,call_function,permute.default,forward,4,1,1,1,2,5351,3 -758,alias_default_135,call_function,alias.default,forward,4,1,1,4,529,5351,4 -759,alias_default_136,call_function,alias.default,forward,4,1,1,2,3,5350,3 -760,einsum_default_32,call_function,einsum.default,forward,4,2,2,1,534,5348,5 -761,alias_default_137,call_function,alias.default,forward,4,1,1,2,535,5347,4 -762,convert_element_type_114,call_function,convert_element_type.default,forward,4,1,1,1,536,5335,4 -763,alias_default_138,call_function,alias.default,forward,4,1,1,2,537,5334,4 -764,neg_4,call_function,neg.default,forward,4,1,1,1,538,5333,8 
-765,exp_4,call_function,exp.default,forward,4,1,1,1,539,5332,6 -766,add_23,call_function,add.Tensor,forward,4,1,1,1,540,5331,4 -767,div_4,call_function,div.Tensor,forward,4,2,2,1,541,5330,6 -768,convert_element_type_115,call_function,convert_element_type.default,forward,4,1,1,1,542,5329,6 -769,dtype_cast_44,call_function,dtype_cast.default,forward,4,1,1,1,1,5333,3 -770,permute_53,call_function,permute.default,forward,4,1,1,1,2,5332,3 -771,alias_default_140,call_function,alias.default,forward,4,1,1,2,3,5331,3 -772,einsum_default_33,call_function,einsum.default,forward,4,2,2,1,534,5329,5 -773,alias_default_139,call_function,alias.default,forward,4,1,1,2,543,5328,4 -774,alias_default_141,call_function,alias.default,forward,4,1,1,2,535,5328,4 -775,mul_34,call_function,mul.Tensor,forward,4,2,2,1,550,5327,8 -776,dtype_cast_45,call_function,dtype_cast.default,forward,4,1,1,1,1,5329,3 -777,permute_54,call_function,permute.default,forward,4,1,1,1,2,5328,3 -778,alias_default_142,call_function,alias.default,forward,4,1,1,2,551,5326,4 -779,alias_default_143,call_function,alias.default,forward,4,1,1,2,3,5327,3 -780,einsum_default_34,call_function,einsum.default,forward,4,2,2,1,556,5325,5 -781,add_24,call_function,add.Tensor,forward,4,2,2,1,557,5324,10 -782,dtype_cast_46,call_function,dtype_cast.default,forward,5,1,1,1,1,5313,2 -783,alias_default_144,call_function,alias.default,forward,4,1,1,3,558,5323,4 -784,convert_element_type_120,call_function,convert_element_type.default,forward,5,1,1,1,559,5321,4 -785,alias_default_146,call_function,alias.default,forward,5,1,1,2,560,5320,4 -786,pow_11,call_function,pow.Tensor_Scalar,forward,5,1,1,1,561,5319,4 -787,mean_10,call_function,mean.dim,forward,5,1,1,1,562,5318,4 -788,add_25,call_function,add.Scalar,forward,5,1,1,1,563,5317,3 -789,rsqrt_10,call_function,rsqrt.default,forward,5,1,1,1,564,5316,3 -790,alias_default_147,call_function,alias.default,forward,5,1,1,3,565,5315,3 
-791,mul_35,call_function,mul.Tensor,forward,5,2,2,1,566,5311,8 -792,alias_default_145,call_function,alias.default,forward,5,1,1,2,2,5312,2 -793,mul_36,call_function,mul.Tensor,forward,5,2,2,1,570,5310,8 -794,convert_element_type_121,call_function,convert_element_type.default,forward,5,1,1,1,571,5309,6 -795,dtype_cast_47,call_function,dtype_cast.default,forward,5,1,1,1,1,5296,3 -796,permute_55,call_function,permute.default,forward,5,1,1,1,2,5295,3 -797,alias_default_148,call_function,alias.default,forward,5,1,1,6,572,5308,4 -798,alias_default_149,call_function,alias.default,forward,5,1,1,2,3,5294,3 -799,einsum_default_35,call_function,einsum.default,forward,5,2,2,1,577,5292,5 -800,dtype_cast_48,call_function,dtype_cast.default,forward,5,1,1,1,1,5296,3 -801,permute_56,call_function,permute.default,forward,5,1,1,1,2,5295,3 -802,alias_default_150,call_function,alias.default,forward,5,1,1,2,3,5294,3 -803,einsum_default_36,call_function,einsum.default,forward,5,2,2,1,577,5292,5 -804,dtype_cast_49,call_function,dtype_cast.default,forward,5,1,1,1,1,5289,3 -805,permute_57,call_function,permute.default,forward,5,1,1,1,2,5288,3 -806,alias_default_151,call_function,alias.default,forward,5,1,1,2,3,5287,3 -807,einsum_default_37,call_function,einsum.default,forward,5,2,2,1,577,5285,5 -808,view_121,call_function,view.default,forward,5,1,1,1,578,5291,4 -809,view_122,call_function,view.default,forward,5,1,1,1,578,5291,4 -810,view_123,call_function,view.default,forward,5,1,1,1,578,5284,4 -811,convert_element_type_128,call_function,convert_element_type.default,forward,5,1,1,1,579,5290,4 -812,view_124,call_function,view.default,forward,5,1,1,1,580,5289,4 -813,view_as_complex_10,call_function,view_as_complex.default,forward,5,1,1,1,581,5288,6 -814,convert_element_type_129,call_function,convert_element_type.default,forward,5,1,1,1,579,5290,4 -815,view_125,call_function,view.default,forward,5,1,1,1,580,5289,4 
-816,view_as_complex_11,call_function,view_as_complex.default,forward,5,1,1,1,581,5288,6 -817,view_126,call_function,view.default,forward,5,1,1,1,2,5299,3 -818,alias_default_152,call_function,alias.default,forward,5,1,1,4,3,5298,3 -819,mul_37,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8 -820,view_as_real_10,call_function,view_as_real.default,forward,5,1,1,1,585,5286,6 -821,view_127,call_function,view.default,forward,5,1,1,1,586,5285,6 -822,mul_38,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8 -823,view_as_real_11,call_function,view_as_real.default,forward,5,1,1,1,585,5286,6 -824,view_128,call_function,view.default,forward,5,1,1,1,586,5285,6 -825,convert_element_type_130,call_function,convert_element_type.default,forward,5,1,1,1,587,5284,6 -826,convert_element_type_131,call_function,convert_element_type.default,forward,5,1,1,1,587,5284,6 -827,permute_58,call_function,permute.default,forward,5,1,1,1,588,5283,6 -828,permute_59,call_function,permute.default,forward,5,1,1,1,588,5283,6 -829,permute_60,call_function,permute.default,forward,5,1,1,1,579,5283,4 -830,alias_default_153,call_function,alias.default,forward,5,1,1,2,589,5282,4 -831,alias_default_154,call_function,alias.default,forward,5,1,1,2,589,5282,4 -832,alias_default_155,call_function,alias.default,forward,5,1,1,2,580,5282,4 -833,_scaled_dot_product_flash_attention_5,call_function,_scaled_dot_product_flash_attention.default,forward,5,3,3,4,613,5281,2 -834,getitem_45,call_function,getitem,forward,5,1,1,1,614,5277,2 -835,getitem_46,call_function,getitem,forward,5,1,1,1,614,614,2 -836,getitem_51,call_function,getitem,forward,5,1,1,1,614,614,1 -837,getitem_52,call_function,getitem,forward,5,1,1,1,614,614,1 -838,alias_default_156,call_function,alias.default,forward,5,1,1,2,615,5276,4 -839,permute_61,call_function,permute.default,forward,5,1,1,1,616,5275,4 -840,view_129,call_function,view.default,forward,5,1,1,1,617,5274,3 -841,dtype_cast_50,call_function,dtype_cast.default,forward,5,1,1,1,1,5276,3 
-842,permute_62,call_function,permute.default,forward,5,1,1,1,2,5275,3 -843,alias_default_157,call_function,alias.default,forward,5,1,1,2,618,5273,4 -844,alias_default_158,call_function,alias.default,forward,5,1,1,2,3,5274,3 -845,einsum_default_38,call_function,einsum.default,forward,5,2,2,1,623,5272,5 -846,add_26,call_function,add.Tensor,forward,5,2,2,1,624,5271,10 -847,dtype_cast_51,call_function,dtype_cast.default,forward,5,1,1,1,1,5260,2 -848,alias_default_159,call_function,alias.default,forward,5,1,1,3,625,5270,4 -849,convert_element_type_134,call_function,convert_element_type.default,forward,5,1,1,1,626,5268,4 -850,alias_default_161,call_function,alias.default,forward,5,1,1,2,627,5267,4 -851,pow_12,call_function,pow.Tensor_Scalar,forward,5,1,1,1,628,5266,4 -852,mean_11,call_function,mean.dim,forward,5,1,1,1,629,5265,4 -853,add_27,call_function,add.Scalar,forward,5,1,1,1,630,5264,3 -854,rsqrt_11,call_function,rsqrt.default,forward,5,1,1,1,631,5263,3 -855,alias_default_162,call_function,alias.default,forward,5,1,1,3,632,5262,3 -856,mul_39,call_function,mul.Tensor,forward,5,2,2,1,633,5258,8 -857,alias_default_160,call_function,alias.default,forward,5,1,1,2,2,5259,2 -858,mul_40,call_function,mul.Tensor,forward,5,2,2,1,637,5257,8 -859,convert_element_type_135,call_function,convert_element_type.default,forward,5,1,1,1,638,5256,6 -860,dtype_cast_52,call_function,dtype_cast.default,forward,5,1,1,1,1,5256,3 -861,permute_63,call_function,permute.default,forward,5,1,1,1,2,5255,3 -862,alias_default_163,call_function,alias.default,forward,5,1,1,4,639,5255,4 -863,alias_default_164,call_function,alias.default,forward,5,1,1,2,3,5254,3 -864,einsum_default_39,call_function,einsum.default,forward,5,2,2,1,644,5252,5 -865,alias_default_165,call_function,alias.default,forward,5,1,1,2,645,5251,4 -866,convert_element_type_138,call_function,convert_element_type.default,forward,5,1,1,1,646,5239,4 -867,alias_default_166,call_function,alias.default,forward,5,1,1,2,647,5238,4 
-868,neg_5,call_function,neg.default,forward,5,1,1,1,648,5237,8 -869,exp_5,call_function,exp.default,forward,5,1,1,1,649,5236,6 -870,add_28,call_function,add.Tensor,forward,5,1,1,1,650,5235,4 -871,div_5,call_function,div.Tensor,forward,5,2,2,1,651,5234,6 -872,convert_element_type_139,call_function,convert_element_type.default,forward,5,1,1,1,652,5233,6 -873,dtype_cast_53,call_function,dtype_cast.default,forward,5,1,1,1,1,5237,3 -874,permute_64,call_function,permute.default,forward,5,1,1,1,2,5236,3 -875,alias_default_168,call_function,alias.default,forward,5,1,1,2,3,5235,3 -876,einsum_default_40,call_function,einsum.default,forward,5,2,2,1,644,5233,5 -877,alias_default_167,call_function,alias.default,forward,5,1,1,2,653,5232,4 -878,alias_default_169,call_function,alias.default,forward,5,1,1,2,645,5232,4 -879,mul_41,call_function,mul.Tensor,forward,5,2,2,1,660,5231,8 -880,dtype_cast_54,call_function,dtype_cast.default,forward,5,1,1,1,1,5233,3 -881,permute_65,call_function,permute.default,forward,5,1,1,1,2,5232,3 -882,alias_default_170,call_function,alias.default,forward,5,1,1,2,661,5230,4 -883,alias_default_171,call_function,alias.default,forward,5,1,1,2,3,5231,3 -884,einsum_default_41,call_function,einsum.default,forward,5,2,2,1,666,5229,5 -885,add_29,call_function,add.Tensor,forward,5,2,2,1,667,5228,10 -886,dtype_cast_55,call_function,dtype_cast.default,forward,6,1,1,1,1,5217,2 -887,alias_default_172,call_function,alias.default,forward,5,1,1,3,668,5227,4 -888,convert_element_type_144,call_function,convert_element_type.default,forward,6,1,1,1,669,5225,4 -889,alias_default_174,call_function,alias.default,forward,6,1,1,2,670,5224,4 -890,pow_13,call_function,pow.Tensor_Scalar,forward,6,1,1,1,671,5223,4 -891,mean_12,call_function,mean.dim,forward,6,1,1,1,672,5222,4 -892,add_30,call_function,add.Scalar,forward,6,1,1,1,673,5221,3 -893,rsqrt_12,call_function,rsqrt.default,forward,6,1,1,1,674,5220,3 
-894,alias_default_175,call_function,alias.default,forward,6,1,1,3,675,5219,3 -895,mul_42,call_function,mul.Tensor,forward,6,2,2,1,676,5215,8 -896,alias_default_173,call_function,alias.default,forward,6,1,1,2,2,5216,2 -897,mul_43,call_function,mul.Tensor,forward,6,2,2,1,680,5214,8 -898,convert_element_type_145,call_function,convert_element_type.default,forward,6,1,1,1,681,5213,6 -899,dtype_cast_56,call_function,dtype_cast.default,forward,6,1,1,1,1,5200,3 -900,permute_66,call_function,permute.default,forward,6,1,1,1,2,5199,3 -901,alias_default_176,call_function,alias.default,forward,6,1,1,6,682,5212,4 -902,alias_default_177,call_function,alias.default,forward,6,1,1,2,3,5198,3 -903,einsum_default_42,call_function,einsum.default,forward,6,2,2,1,687,5196,5 -904,dtype_cast_57,call_function,dtype_cast.default,forward,6,1,1,1,1,5200,3 -905,permute_67,call_function,permute.default,forward,6,1,1,1,2,5199,3 -906,alias_default_178,call_function,alias.default,forward,6,1,1,2,3,5198,3 -907,einsum_default_43,call_function,einsum.default,forward,6,2,2,1,687,5196,5 -908,dtype_cast_58,call_function,dtype_cast.default,forward,6,1,1,1,1,5193,3 -909,permute_68,call_function,permute.default,forward,6,1,1,1,2,5192,3 -910,alias_default_179,call_function,alias.default,forward,6,1,1,2,3,5191,3 -911,einsum_default_44,call_function,einsum.default,forward,6,2,2,1,687,5189,5 -912,view_144,call_function,view.default,forward,6,1,1,1,688,5195,4 -913,view_145,call_function,view.default,forward,6,1,1,1,688,5195,4 -914,view_146,call_function,view.default,forward,6,1,1,1,688,5188,4 -915,convert_element_type_152,call_function,convert_element_type.default,forward,6,1,1,1,689,5194,4 -916,view_147,call_function,view.default,forward,6,1,1,1,690,5193,4 -917,view_as_complex_12,call_function,view_as_complex.default,forward,6,1,1,1,691,5192,6 -918,convert_element_type_153,call_function,convert_element_type.default,forward,6,1,1,1,689,5194,4 -919,view_148,call_function,view.default,forward,6,1,1,1,690,5193,4 
-920,view_as_complex_13,call_function,view_as_complex.default,forward,6,1,1,1,691,5192,6 -921,view_149,call_function,view.default,forward,6,1,1,1,2,5203,3 -922,alias_default_180,call_function,alias.default,forward,6,1,1,4,3,5202,3 -923,mul_44,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8 -924,view_as_real_12,call_function,view_as_real.default,forward,6,1,1,1,695,5190,6 -925,view_150,call_function,view.default,forward,6,1,1,1,696,5189,6 -926,mul_45,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8 -927,view_as_real_13,call_function,view_as_real.default,forward,6,1,1,1,695,5190,6 -928,view_151,call_function,view.default,forward,6,1,1,1,696,5189,6 -929,convert_element_type_154,call_function,convert_element_type.default,forward,6,1,1,1,697,5188,6 -930,convert_element_type_155,call_function,convert_element_type.default,forward,6,1,1,1,697,5188,6 -931,permute_69,call_function,permute.default,forward,6,1,1,1,698,5187,6 -932,permute_70,call_function,permute.default,forward,6,1,1,1,698,5187,6 -933,permute_71,call_function,permute.default,forward,6,1,1,1,689,5187,4 -934,alias_default_181,call_function,alias.default,forward,6,1,1,2,699,5186,4 -935,alias_default_182,call_function,alias.default,forward,6,1,1,2,699,5186,4 -936,alias_default_183,call_function,alias.default,forward,6,1,1,2,690,5186,4 -937,_scaled_dot_product_flash_attention_6,call_function,_scaled_dot_product_flash_attention.default,forward,6,3,3,4,723,5185,2 -938,getitem_54,call_function,getitem,forward,6,1,1,1,724,5181,2 -939,getitem_55,call_function,getitem,forward,6,1,1,1,724,724,2 -940,getitem_60,call_function,getitem,forward,6,1,1,1,724,724,1 -941,getitem_61,call_function,getitem,forward,6,1,1,1,724,724,1 -942,alias_default_184,call_function,alias.default,forward,6,1,1,2,725,5180,4 -943,permute_72,call_function,permute.default,forward,6,1,1,1,726,5179,4 -944,view_152,call_function,view.default,forward,6,1,1,1,727,5178,3 -945,dtype_cast_59,call_function,dtype_cast.default,forward,6,1,1,1,1,5180,3 
-946,permute_73,call_function,permute.default,forward,6,1,1,1,2,5179,3 -947,alias_default_185,call_function,alias.default,forward,6,1,1,2,728,5177,4 -948,alias_default_186,call_function,alias.default,forward,6,1,1,2,3,5178,3 -949,einsum_default_45,call_function,einsum.default,forward,6,2,2,1,733,5176,5 -950,add_31,call_function,add.Tensor,forward,6,2,2,1,734,5175,10 -951,dtype_cast_60,call_function,dtype_cast.default,forward,6,1,1,1,1,5164,2 -952,alias_default_187,call_function,alias.default,forward,6,1,1,3,735,5174,4 -953,convert_element_type_158,call_function,convert_element_type.default,forward,6,1,1,1,736,5172,4 -954,alias_default_189,call_function,alias.default,forward,6,1,1,2,737,5171,4 -955,pow_14,call_function,pow.Tensor_Scalar,forward,6,1,1,1,738,5170,4 -956,mean_13,call_function,mean.dim,forward,6,1,1,1,739,5169,4 -957,add_32,call_function,add.Scalar,forward,6,1,1,1,740,5168,3 -958,rsqrt_13,call_function,rsqrt.default,forward,6,1,1,1,741,5167,3 -959,alias_default_190,call_function,alias.default,forward,6,1,1,3,742,5166,3 -960,mul_46,call_function,mul.Tensor,forward,6,2,2,1,743,5162,8 -961,alias_default_188,call_function,alias.default,forward,6,1,1,2,2,5163,2 -962,mul_47,call_function,mul.Tensor,forward,6,2,2,1,747,5161,8 -963,convert_element_type_159,call_function,convert_element_type.default,forward,6,1,1,1,748,5160,6 -964,dtype_cast_61,call_function,dtype_cast.default,forward,6,1,1,1,1,5160,3 -965,permute_74,call_function,permute.default,forward,6,1,1,1,2,5159,3 -966,alias_default_191,call_function,alias.default,forward,6,1,1,4,749,5159,4 -967,alias_default_192,call_function,alias.default,forward,6,1,1,2,3,5158,3 -968,einsum_default_46,call_function,einsum.default,forward,6,2,2,1,754,5156,5 -969,alias_default_193,call_function,alias.default,forward,6,1,1,2,755,5155,4 -970,convert_element_type_162,call_function,convert_element_type.default,forward,6,1,1,1,756,5143,4 -971,alias_default_194,call_function,alias.default,forward,6,1,1,2,757,5142,4 
-972,neg_6,call_function,neg.default,forward,6,1,1,1,758,5141,8 -973,exp_6,call_function,exp.default,forward,6,1,1,1,759,5140,6 -974,add_33,call_function,add.Tensor,forward,6,1,1,1,760,5139,4 -975,div_6,call_function,div.Tensor,forward,6,2,2,1,761,5138,6 -976,convert_element_type_163,call_function,convert_element_type.default,forward,6,1,1,1,762,5137,6 -977,dtype_cast_62,call_function,dtype_cast.default,forward,6,1,1,1,1,5141,3 -978,permute_75,call_function,permute.default,forward,6,1,1,1,2,5140,3 -979,alias_default_196,call_function,alias.default,forward,6,1,1,2,3,5139,3 -980,einsum_default_47,call_function,einsum.default,forward,6,2,2,1,754,5137,5 -981,alias_default_195,call_function,alias.default,forward,6,1,1,2,763,5136,4 -982,alias_default_197,call_function,alias.default,forward,6,1,1,2,755,5136,4 -983,mul_48,call_function,mul.Tensor,forward,6,2,2,1,770,5135,8 -984,dtype_cast_63,call_function,dtype_cast.default,forward,6,1,1,1,1,5137,3 -985,permute_76,call_function,permute.default,forward,6,1,1,1,2,5136,3 -986,alias_default_198,call_function,alias.default,forward,6,1,1,2,771,5134,4 -987,alias_default_199,call_function,alias.default,forward,6,1,1,2,3,5135,3 -988,einsum_default_48,call_function,einsum.default,forward,6,2,2,1,776,5133,5 -989,add_34,call_function,add.Tensor,forward,6,2,2,1,777,5132,10 -990,dtype_cast_64,call_function,dtype_cast.default,forward,7,1,1,1,1,5121,2 -991,alias_default_200,call_function,alias.default,forward,6,1,1,3,778,5131,4 -992,convert_element_type_168,call_function,convert_element_type.default,forward,7,1,1,1,779,5129,4 -993,alias_default_202,call_function,alias.default,forward,7,1,1,2,780,5128,4 -994,pow_15,call_function,pow.Tensor_Scalar,forward,7,1,1,1,781,5127,4 -995,mean_14,call_function,mean.dim,forward,7,1,1,1,782,5126,4 -996,add_35,call_function,add.Scalar,forward,7,1,1,1,783,5125,3 -997,rsqrt_14,call_function,rsqrt.default,forward,7,1,1,1,784,5124,3 
-998,alias_default_203,call_function,alias.default,forward,7,1,1,3,785,5123,3 -999,mul_49,call_function,mul.Tensor,forward,7,2,2,1,786,5119,8 -1000,alias_default_201,call_function,alias.default,forward,7,1,1,2,2,5120,2 -1001,mul_50,call_function,mul.Tensor,forward,7,2,2,1,790,5118,8 -1002,convert_element_type_169,call_function,convert_element_type.default,forward,7,1,1,1,791,5117,6 -1003,dtype_cast_65,call_function,dtype_cast.default,forward,7,1,1,1,1,5104,3 -1004,permute_77,call_function,permute.default,forward,7,1,1,1,2,5103,3 -1005,alias_default_204,call_function,alias.default,forward,7,1,1,6,792,5116,4 -1006,alias_default_205,call_function,alias.default,forward,7,1,1,2,3,5102,3 -1007,einsum_default_49,call_function,einsum.default,forward,7,2,2,1,797,5100,5 -1008,dtype_cast_66,call_function,dtype_cast.default,forward,7,1,1,1,1,5104,3 -1009,permute_78,call_function,permute.default,forward,7,1,1,1,2,5103,3 -1010,alias_default_206,call_function,alias.default,forward,7,1,1,2,3,5102,3 -1011,einsum_default_50,call_function,einsum.default,forward,7,2,2,1,797,5100,5 -1012,dtype_cast_67,call_function,dtype_cast.default,forward,7,1,1,1,1,5097,3 -1013,permute_79,call_function,permute.default,forward,7,1,1,1,2,5096,3 -1014,alias_default_207,call_function,alias.default,forward,7,1,1,2,3,5095,3 -1015,einsum_default_51,call_function,einsum.default,forward,7,2,2,1,797,5093,5 -1016,view_167,call_function,view.default,forward,7,1,1,1,798,5099,4 -1017,view_168,call_function,view.default,forward,7,1,1,1,798,5099,4 -1018,view_169,call_function,view.default,forward,7,1,1,1,798,5092,4 -1019,convert_element_type_176,call_function,convert_element_type.default,forward,7,1,1,1,799,5098,4 -1020,view_170,call_function,view.default,forward,7,1,1,1,800,5097,4 -1021,view_as_complex_14,call_function,view_as_complex.default,forward,7,1,1,1,801,5096,6 -1022,convert_element_type_177,call_function,convert_element_type.default,forward,7,1,1,1,799,5098,4 
-1023,view_171,call_function,view.default,forward,7,1,1,1,800,5097,4 -1024,view_as_complex_15,call_function,view_as_complex.default,forward,7,1,1,1,801,5096,6 -1025,view_172,call_function,view.default,forward,7,1,1,1,2,5107,3 -1026,alias_default_208,call_function,alias.default,forward,7,1,1,4,3,5106,3 -1027,mul_51,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8 -1028,view_as_real_14,call_function,view_as_real.default,forward,7,1,1,1,805,5094,6 -1029,view_173,call_function,view.default,forward,7,1,1,1,806,5093,6 -1030,mul_52,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8 -1031,view_as_real_15,call_function,view_as_real.default,forward,7,1,1,1,805,5094,6 -1032,view_174,call_function,view.default,forward,7,1,1,1,806,5093,6 -1033,convert_element_type_178,call_function,convert_element_type.default,forward,7,1,1,1,807,5092,6 -1034,convert_element_type_179,call_function,convert_element_type.default,forward,7,1,1,1,807,5092,6 -1035,permute_80,call_function,permute.default,forward,7,1,1,1,808,5091,6 -1036,permute_81,call_function,permute.default,forward,7,1,1,1,808,5091,6 -1037,permute_82,call_function,permute.default,forward,7,1,1,1,799,5091,4 -1038,alias_default_209,call_function,alias.default,forward,7,1,1,2,809,5090,4 -1039,alias_default_210,call_function,alias.default,forward,7,1,1,2,809,5090,4 -1040,alias_default_211,call_function,alias.default,forward,7,1,1,2,800,5090,4 -1041,_scaled_dot_product_flash_attention_7,call_function,_scaled_dot_product_flash_attention.default,forward,7,3,3,4,833,5089,2 -1042,getitem_63,call_function,getitem,forward,7,1,1,1,834,5085,2 -1043,getitem_64,call_function,getitem,forward,7,1,1,1,834,834,2 -1044,getitem_69,call_function,getitem,forward,7,1,1,1,834,834,1 -1045,getitem_70,call_function,getitem,forward,7,1,1,1,834,834,1 -1046,alias_default_212,call_function,alias.default,forward,7,1,1,2,835,5084,4 -1047,permute_83,call_function,permute.default,forward,7,1,1,1,836,5083,4 
-1048,view_175,call_function,view.default,forward,7,1,1,1,837,5082,3 -1049,dtype_cast_68,call_function,dtype_cast.default,forward,7,1,1,1,1,5084,3 -1050,permute_84,call_function,permute.default,forward,7,1,1,1,2,5083,3 -1051,alias_default_213,call_function,alias.default,forward,7,1,1,2,838,5081,4 -1052,alias_default_214,call_function,alias.default,forward,7,1,1,2,3,5082,3 -1053,einsum_default_52,call_function,einsum.default,forward,7,2,2,1,843,5080,5 -1054,add_36,call_function,add.Tensor,forward,7,2,2,1,844,5079,10 -1055,dtype_cast_69,call_function,dtype_cast.default,forward,7,1,1,1,1,5068,2 -1056,alias_default_215,call_function,alias.default,forward,7,1,1,3,845,5078,4 -1057,convert_element_type_182,call_function,convert_element_type.default,forward,7,1,1,1,846,5076,4 -1058,alias_default_217,call_function,alias.default,forward,7,1,1,2,847,5075,4 -1059,pow_16,call_function,pow.Tensor_Scalar,forward,7,1,1,1,848,5074,4 -1060,mean_15,call_function,mean.dim,forward,7,1,1,1,849,5073,4 -1061,add_37,call_function,add.Scalar,forward,7,1,1,1,850,5072,3 -1062,rsqrt_15,call_function,rsqrt.default,forward,7,1,1,1,851,5071,3 -1063,alias_default_218,call_function,alias.default,forward,7,1,1,3,852,5070,3 -1064,mul_53,call_function,mul.Tensor,forward,7,2,2,1,853,5066,8 -1065,alias_default_216,call_function,alias.default,forward,7,1,1,2,2,5067,2 -1066,mul_54,call_function,mul.Tensor,forward,7,2,2,1,857,5065,8 -1067,convert_element_type_183,call_function,convert_element_type.default,forward,7,1,1,1,858,5064,6 -1068,dtype_cast_70,call_function,dtype_cast.default,forward,7,1,1,1,1,5064,3 -1069,permute_85,call_function,permute.default,forward,7,1,1,1,2,5063,3 -1070,alias_default_219,call_function,alias.default,forward,7,1,1,4,859,5063,4 -1071,alias_default_220,call_function,alias.default,forward,7,1,1,2,3,5062,3 -1072,einsum_default_53,call_function,einsum.default,forward,7,2,2,1,864,5060,5 -1073,alias_default_221,call_function,alias.default,forward,7,1,1,2,865,5059,4 
-1074,convert_element_type_186,call_function,convert_element_type.default,forward,7,1,1,1,866,5047,4 -1075,alias_default_222,call_function,alias.default,forward,7,1,1,2,867,5046,4 -1076,neg_7,call_function,neg.default,forward,7,1,1,1,868,5045,8 -1077,exp_7,call_function,exp.default,forward,7,1,1,1,869,5044,6 -1078,add_38,call_function,add.Tensor,forward,7,1,1,1,870,5043,4 -1079,div_7,call_function,div.Tensor,forward,7,2,2,1,871,5042,6 -1080,convert_element_type_187,call_function,convert_element_type.default,forward,7,1,1,1,872,5041,6 -1081,dtype_cast_71,call_function,dtype_cast.default,forward,7,1,1,1,1,5045,3 -1082,permute_86,call_function,permute.default,forward,7,1,1,1,2,5044,3 -1083,alias_default_224,call_function,alias.default,forward,7,1,1,2,3,5043,3 -1084,einsum_default_54,call_function,einsum.default,forward,7,2,2,1,864,5041,5 -1085,alias_default_223,call_function,alias.default,forward,7,1,1,2,873,5040,4 -1086,alias_default_225,call_function,alias.default,forward,7,1,1,2,865,5040,4 -1087,mul_55,call_function,mul.Tensor,forward,7,2,2,1,880,5039,8 -1088,dtype_cast_72,call_function,dtype_cast.default,forward,7,1,1,1,1,5041,3 -1089,permute_87,call_function,permute.default,forward,7,1,1,1,2,5040,3 -1090,alias_default_226,call_function,alias.default,forward,7,1,1,2,881,5038,4 -1091,alias_default_227,call_function,alias.default,forward,7,1,1,2,3,5039,3 -1092,einsum_default_55,call_function,einsum.default,forward,7,2,2,1,886,5037,5 -1093,add_39,call_function,add.Tensor,forward,7,2,2,1,887,5036,10 -1094,dtype_cast_73,call_function,dtype_cast.default,forward,8,1,1,1,1,5025,2 -1095,alias_default_228,call_function,alias.default,forward,7,1,1,3,888,5035,4 -1096,convert_element_type_192,call_function,convert_element_type.default,forward,8,1,1,1,889,5033,4 -1097,alias_default_230,call_function,alias.default,forward,8,1,1,2,890,5032,4 -1098,pow_17,call_function,pow.Tensor_Scalar,forward,8,1,1,1,891,5031,4 -1099,mean_16,call_function,mean.dim,forward,8,1,1,1,892,5030,4 
-1100,add_40,call_function,add.Scalar,forward,8,1,1,1,893,5029,3 -1101,rsqrt_16,call_function,rsqrt.default,forward,8,1,1,1,894,5028,3 -1102,alias_default_231,call_function,alias.default,forward,8,1,1,3,895,5027,3 -1103,mul_56,call_function,mul.Tensor,forward,8,2,2,1,896,5023,8 -1104,alias_default_229,call_function,alias.default,forward,8,1,1,2,2,5024,2 -1105,mul_57,call_function,mul.Tensor,forward,8,2,2,1,900,5022,8 -1106,convert_element_type_193,call_function,convert_element_type.default,forward,8,1,1,1,901,5021,6 -1107,dtype_cast_74,call_function,dtype_cast.default,forward,8,1,1,1,1,5008,3 -1108,permute_88,call_function,permute.default,forward,8,1,1,1,2,5007,3 -1109,alias_default_232,call_function,alias.default,forward,8,1,1,6,902,5020,4 -1110,alias_default_233,call_function,alias.default,forward,8,1,1,2,3,5006,3 -1111,einsum_default_56,call_function,einsum.default,forward,8,2,2,1,907,5004,5 -1112,dtype_cast_75,call_function,dtype_cast.default,forward,8,1,1,1,1,5008,3 -1113,permute_89,call_function,permute.default,forward,8,1,1,1,2,5007,3 -1114,alias_default_234,call_function,alias.default,forward,8,1,1,2,3,5006,3 -1115,einsum_default_57,call_function,einsum.default,forward,8,2,2,1,907,5004,5 -1116,dtype_cast_76,call_function,dtype_cast.default,forward,8,1,1,1,1,5001,3 -1117,permute_90,call_function,permute.default,forward,8,1,1,1,2,5000,3 -1118,alias_default_235,call_function,alias.default,forward,8,1,1,2,3,4999,3 -1119,einsum_default_58,call_function,einsum.default,forward,8,2,2,1,907,4997,5 -1120,view_190,call_function,view.default,forward,8,1,1,1,908,5003,4 -1121,view_191,call_function,view.default,forward,8,1,1,1,908,5003,4 -1122,view_192,call_function,view.default,forward,8,1,1,1,908,4996,4 -1123,convert_element_type_200,call_function,convert_element_type.default,forward,8,1,1,1,909,5002,4 -1124,view_193,call_function,view.default,forward,8,1,1,1,910,5001,4 -1125,view_as_complex_16,call_function,view_as_complex.default,forward,8,1,1,1,911,5000,6 
-1126,convert_element_type_201,call_function,convert_element_type.default,forward,8,1,1,1,909,5002,4 -1127,view_194,call_function,view.default,forward,8,1,1,1,910,5001,4 -1128,view_as_complex_17,call_function,view_as_complex.default,forward,8,1,1,1,911,5000,6 -1129,view_195,call_function,view.default,forward,8,1,1,1,2,5011,3 -1130,alias_default_236,call_function,alias.default,forward,8,1,1,4,3,5010,3 -1131,mul_58,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8 -1132,view_as_real_16,call_function,view_as_real.default,forward,8,1,1,1,915,4998,6 -1133,view_196,call_function,view.default,forward,8,1,1,1,916,4997,6 -1134,mul_59,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8 -1135,view_as_real_17,call_function,view_as_real.default,forward,8,1,1,1,915,4998,6 -1136,view_197,call_function,view.default,forward,8,1,1,1,916,4997,6 -1137,convert_element_type_202,call_function,convert_element_type.default,forward,8,1,1,1,917,4996,6 -1138,convert_element_type_203,call_function,convert_element_type.default,forward,8,1,1,1,917,4996,6 -1139,permute_91,call_function,permute.default,forward,8,1,1,1,918,4995,6 -1140,permute_92,call_function,permute.default,forward,8,1,1,1,918,4995,6 -1141,permute_93,call_function,permute.default,forward,8,1,1,1,909,4995,4 -1142,alias_default_237,call_function,alias.default,forward,8,1,1,2,919,4994,4 -1143,alias_default_238,call_function,alias.default,forward,8,1,1,2,919,4994,4 -1144,alias_default_239,call_function,alias.default,forward,8,1,1,2,910,4994,4 -1145,_scaled_dot_product_flash_attention_8,call_function,_scaled_dot_product_flash_attention.default,forward,8,3,3,4,943,4993,2 -1146,getitem_72,call_function,getitem,forward,8,1,1,1,944,4989,2 -1147,getitem_73,call_function,getitem,forward,8,1,1,1,944,944,2 -1148,getitem_78,call_function,getitem,forward,8,1,1,1,944,944,1 -1149,getitem_79,call_function,getitem,forward,8,1,1,1,944,944,1 -1150,alias_default_240,call_function,alias.default,forward,8,1,1,2,945,4988,4 
-1151,permute_94,call_function,permute.default,forward,8,1,1,1,946,4987,4 -1152,view_198,call_function,view.default,forward,8,1,1,1,947,4986,3 -1153,dtype_cast_77,call_function,dtype_cast.default,forward,8,1,1,1,1,4988,3 -1154,permute_95,call_function,permute.default,forward,8,1,1,1,2,4987,3 -1155,alias_default_241,call_function,alias.default,forward,8,1,1,2,948,4985,4 -1156,alias_default_242,call_function,alias.default,forward,8,1,1,2,3,4986,3 -1157,einsum_default_59,call_function,einsum.default,forward,8,2,2,1,953,4984,5 -1158,add_41,call_function,add.Tensor,forward,8,2,2,1,954,4983,10 -1159,dtype_cast_78,call_function,dtype_cast.default,forward,8,1,1,1,1,4972,2 -1160,alias_default_243,call_function,alias.default,forward,8,1,1,3,955,4982,4 -1161,convert_element_type_206,call_function,convert_element_type.default,forward,8,1,1,1,956,4980,4 -1162,alias_default_245,call_function,alias.default,forward,8,1,1,2,957,4979,4 -1163,pow_18,call_function,pow.Tensor_Scalar,forward,8,1,1,1,958,4978,4 -1164,mean_17,call_function,mean.dim,forward,8,1,1,1,959,4977,4 -1165,add_42,call_function,add.Scalar,forward,8,1,1,1,960,4976,3 -1166,rsqrt_17,call_function,rsqrt.default,forward,8,1,1,1,961,4975,3 -1167,alias_default_246,call_function,alias.default,forward,8,1,1,3,962,4974,3 -1168,mul_60,call_function,mul.Tensor,forward,8,2,2,1,963,4970,8 -1169,alias_default_244,call_function,alias.default,forward,8,1,1,2,2,4971,2 -1170,mul_61,call_function,mul.Tensor,forward,8,2,2,1,967,4969,8 -1171,convert_element_type_207,call_function,convert_element_type.default,forward,8,1,1,1,968,4968,6 -1172,dtype_cast_79,call_function,dtype_cast.default,forward,8,1,1,1,1,4968,3 -1173,permute_96,call_function,permute.default,forward,8,1,1,1,2,4967,3 -1174,alias_default_247,call_function,alias.default,forward,8,1,1,4,969,4967,4 -1175,alias_default_248,call_function,alias.default,forward,8,1,1,2,3,4966,3 -1176,einsum_default_60,call_function,einsum.default,forward,8,2,2,1,974,4964,5 
-1177,alias_default_249,call_function,alias.default,forward,8,1,1,2,975,4963,4 -1178,convert_element_type_210,call_function,convert_element_type.default,forward,8,1,1,1,976,4951,4 -1179,alias_default_250,call_function,alias.default,forward,8,1,1,2,977,4950,4 -1180,neg_8,call_function,neg.default,forward,8,1,1,1,978,4949,8 -1181,exp_8,call_function,exp.default,forward,8,1,1,1,979,4948,6 -1182,add_43,call_function,add.Tensor,forward,8,1,1,1,980,4947,4 -1183,div_8,call_function,div.Tensor,forward,8,2,2,1,981,4946,6 -1184,convert_element_type_211,call_function,convert_element_type.default,forward,8,1,1,1,982,4945,6 -1185,dtype_cast_80,call_function,dtype_cast.default,forward,8,1,1,1,1,4949,3 -1186,permute_97,call_function,permute.default,forward,8,1,1,1,2,4948,3 -1187,alias_default_252,call_function,alias.default,forward,8,1,1,2,3,4947,3 -1188,einsum_default_61,call_function,einsum.default,forward,8,2,2,1,974,4945,5 -1189,alias_default_251,call_function,alias.default,forward,8,1,1,2,983,4944,4 -1190,alias_default_253,call_function,alias.default,forward,8,1,1,2,975,4944,4 -1191,mul_62,call_function,mul.Tensor,forward,8,2,2,1,990,4943,8 -1192,dtype_cast_81,call_function,dtype_cast.default,forward,8,1,1,1,1,4945,3 -1193,permute_98,call_function,permute.default,forward,8,1,1,1,2,4944,3 -1194,alias_default_254,call_function,alias.default,forward,8,1,1,2,991,4942,4 -1195,alias_default_255,call_function,alias.default,forward,8,1,1,2,3,4943,3 -1196,einsum_default_62,call_function,einsum.default,forward,8,2,2,1,996,4941,5 -1197,add_44,call_function,add.Tensor,forward,8,2,2,1,997,4940,10 -1198,dtype_cast_82,call_function,dtype_cast.default,forward,9,1,1,1,1,4929,2 -1199,alias_default_256,call_function,alias.default,forward,8,1,1,3,998,4939,4 -1200,convert_element_type_216,call_function,convert_element_type.default,forward,9,1,1,1,999,4937,4 -1201,alias_default_258,call_function,alias.default,forward,9,1,1,2,1000,4936,4 
-1202,pow_19,call_function,pow.Tensor_Scalar,forward,9,1,1,1,1001,4935,4 -1203,mean_18,call_function,mean.dim,forward,9,1,1,1,1002,4934,4 -1204,add_45,call_function,add.Scalar,forward,9,1,1,1,1003,4933,3 -1205,rsqrt_18,call_function,rsqrt.default,forward,9,1,1,1,1004,4932,3 -1206,alias_default_259,call_function,alias.default,forward,9,1,1,3,1005,4931,3 -1207,mul_63,call_function,mul.Tensor,forward,9,2,2,1,1006,4927,8 -1208,alias_default_257,call_function,alias.default,forward,9,1,1,2,2,4928,2 -1209,mul_64,call_function,mul.Tensor,forward,9,2,2,1,1010,4926,8 -1210,convert_element_type_217,call_function,convert_element_type.default,forward,9,1,1,1,1011,4925,6 -1211,dtype_cast_83,call_function,dtype_cast.default,forward,9,1,1,1,1,4912,3 -1212,permute_99,call_function,permute.default,forward,9,1,1,1,2,4911,3 -1213,alias_default_260,call_function,alias.default,forward,9,1,1,6,1012,4924,4 -1214,alias_default_261,call_function,alias.default,forward,9,1,1,2,3,4910,3 -1215,einsum_default_63,call_function,einsum.default,forward,9,2,2,1,1017,4908,5 -1216,dtype_cast_84,call_function,dtype_cast.default,forward,9,1,1,1,1,4912,3 -1217,permute_100,call_function,permute.default,forward,9,1,1,1,2,4911,3 -1218,alias_default_262,call_function,alias.default,forward,9,1,1,2,3,4910,3 -1219,einsum_default_64,call_function,einsum.default,forward,9,2,2,1,1017,4908,5 -1220,dtype_cast_85,call_function,dtype_cast.default,forward,9,1,1,1,1,4905,3 -1221,permute_101,call_function,permute.default,forward,9,1,1,1,2,4904,3 -1222,alias_default_263,call_function,alias.default,forward,9,1,1,2,3,4903,3 -1223,einsum_default_65,call_function,einsum.default,forward,9,2,2,1,1017,4901,5 -1224,view_213,call_function,view.default,forward,9,1,1,1,1018,4907,4 -1225,view_214,call_function,view.default,forward,9,1,1,1,1018,4907,4 -1226,view_215,call_function,view.default,forward,9,1,1,1,1018,4900,4 -1227,convert_element_type_224,call_function,convert_element_type.default,forward,9,1,1,1,1019,4906,4 
-1228,view_216,call_function,view.default,forward,9,1,1,1,1020,4905,4 -1229,view_as_complex_18,call_function,view_as_complex.default,forward,9,1,1,1,1021,4904,6 -1230,convert_element_type_225,call_function,convert_element_type.default,forward,9,1,1,1,1019,4906,4 -1231,view_217,call_function,view.default,forward,9,1,1,1,1020,4905,4 -1232,view_as_complex_19,call_function,view_as_complex.default,forward,9,1,1,1,1021,4904,6 -1233,view_218,call_function,view.default,forward,9,1,1,1,2,4915,3 -1234,alias_default_264,call_function,alias.default,forward,9,1,1,4,3,4914,3 -1235,mul_65,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8 -1236,view_as_real_18,call_function,view_as_real.default,forward,9,1,1,1,1025,4902,6 -1237,view_219,call_function,view.default,forward,9,1,1,1,1026,4901,6 -1238,mul_66,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8 -1239,view_as_real_19,call_function,view_as_real.default,forward,9,1,1,1,1025,4902,6 -1240,view_220,call_function,view.default,forward,9,1,1,1,1026,4901,6 -1241,convert_element_type_226,call_function,convert_element_type.default,forward,9,1,1,1,1027,4900,6 -1242,convert_element_type_227,call_function,convert_element_type.default,forward,9,1,1,1,1027,4900,6 -1243,permute_102,call_function,permute.default,forward,9,1,1,1,1028,4899,6 -1244,permute_103,call_function,permute.default,forward,9,1,1,1,1028,4899,6 -1245,permute_104,call_function,permute.default,forward,9,1,1,1,1019,4899,4 -1246,alias_default_265,call_function,alias.default,forward,9,1,1,2,1029,4898,4 -1247,alias_default_266,call_function,alias.default,forward,9,1,1,2,1029,4898,4 -1248,alias_default_267,call_function,alias.default,forward,9,1,1,2,1020,4898,4 -1249,_scaled_dot_product_flash_attention_9,call_function,_scaled_dot_product_flash_attention.default,forward,9,3,3,4,1053,4897,2 -1250,getitem_81,call_function,getitem,forward,9,1,1,1,1054,4893,2 -1251,getitem_82,call_function,getitem,forward,9,1,1,1,1054,1054,2 
-1252,getitem_87,call_function,getitem,forward,9,1,1,1,1054,1054,1 -1253,getitem_88,call_function,getitem,forward,9,1,1,1,1054,1054,1 -1254,alias_default_268,call_function,alias.default,forward,9,1,1,2,1055,4892,4 -1255,permute_105,call_function,permute.default,forward,9,1,1,1,1056,4891,4 -1256,view_221,call_function,view.default,forward,9,1,1,1,1057,4890,3 -1257,dtype_cast_86,call_function,dtype_cast.default,forward,9,1,1,1,1,4892,3 -1258,permute_106,call_function,permute.default,forward,9,1,1,1,2,4891,3 -1259,alias_default_269,call_function,alias.default,forward,9,1,1,2,1058,4889,4 -1260,alias_default_270,call_function,alias.default,forward,9,1,1,2,3,4890,3 -1261,einsum_default_66,call_function,einsum.default,forward,9,2,2,1,1063,4888,5 -1262,add_46,call_function,add.Tensor,forward,9,2,2,1,1064,4887,10 -1263,dtype_cast_87,call_function,dtype_cast.default,forward,9,1,1,1,1,4876,2 -1264,alias_default_271,call_function,alias.default,forward,9,1,1,3,1065,4886,4 -1265,convert_element_type_230,call_function,convert_element_type.default,forward,9,1,1,1,1066,4884,4 -1266,alias_default_273,call_function,alias.default,forward,9,1,1,2,1067,4883,4 -1267,pow_20,call_function,pow.Tensor_Scalar,forward,9,1,1,1,1068,4882,4 -1268,mean_19,call_function,mean.dim,forward,9,1,1,1,1069,4881,4 -1269,add_47,call_function,add.Scalar,forward,9,1,1,1,1070,4880,3 -1270,rsqrt_19,call_function,rsqrt.default,forward,9,1,1,1,1071,4879,3 -1271,alias_default_274,call_function,alias.default,forward,9,1,1,3,1072,4878,3 -1272,mul_67,call_function,mul.Tensor,forward,9,2,2,1,1073,4874,8 -1273,alias_default_272,call_function,alias.default,forward,9,1,1,2,2,4875,2 -1274,mul_68,call_function,mul.Tensor,forward,9,2,2,1,1077,4873,8 -1275,convert_element_type_231,call_function,convert_element_type.default,forward,9,1,1,1,1078,4872,6 -1276,dtype_cast_88,call_function,dtype_cast.default,forward,9,1,1,1,1,4872,3 -1277,permute_107,call_function,permute.default,forward,9,1,1,1,2,4871,3 
-1278,alias_default_275,call_function,alias.default,forward,9,1,1,4,1079,4871,4 -1279,alias_default_276,call_function,alias.default,forward,9,1,1,2,3,4870,3 -1280,einsum_default_67,call_function,einsum.default,forward,9,2,2,1,1084,4868,5 -1281,alias_default_277,call_function,alias.default,forward,9,1,1,2,1085,4867,4 -1282,convert_element_type_234,call_function,convert_element_type.default,forward,9,1,1,1,1086,4855,4 -1283,alias_default_278,call_function,alias.default,forward,9,1,1,2,1087,4854,4 -1284,neg_9,call_function,neg.default,forward,9,1,1,1,1088,4853,8 -1285,exp_9,call_function,exp.default,forward,9,1,1,1,1089,4852,6 -1286,add_48,call_function,add.Tensor,forward,9,1,1,1,1090,4851,4 -1287,div_9,call_function,div.Tensor,forward,9,2,2,1,1091,4850,6 -1288,convert_element_type_235,call_function,convert_element_type.default,forward,9,1,1,1,1092,4849,6 -1289,dtype_cast_89,call_function,dtype_cast.default,forward,9,1,1,1,1,4853,3 -1290,permute_108,call_function,permute.default,forward,9,1,1,1,2,4852,3 -1291,alias_default_280,call_function,alias.default,forward,9,1,1,2,3,4851,3 -1292,einsum_default_68,call_function,einsum.default,forward,9,2,2,1,1084,4849,5 -1293,alias_default_279,call_function,alias.default,forward,9,1,1,2,1093,4848,4 -1294,alias_default_281,call_function,alias.default,forward,9,1,1,2,1085,4848,4 -1295,mul_69,call_function,mul.Tensor,forward,9,2,2,1,1100,4847,8 -1296,dtype_cast_90,call_function,dtype_cast.default,forward,9,1,1,1,1,4849,3 -1297,permute_109,call_function,permute.default,forward,9,1,1,1,2,4848,3 -1298,alias_default_282,call_function,alias.default,forward,9,1,1,2,1101,4846,4 -1299,alias_default_283,call_function,alias.default,forward,9,1,1,2,3,4847,3 -1300,einsum_default_69,call_function,einsum.default,forward,9,2,2,1,1106,4845,5 -1301,add_49,call_function,add.Tensor,forward,9,2,2,1,1107,4844,10 -1302,dtype_cast_91,call_function,dtype_cast.default,forward,10,1,1,1,1,4833,2 
-1303,alias_default_284,call_function,alias.default,forward,9,1,1,3,1108,4843,4 -1304,convert_element_type_240,call_function,convert_element_type.default,forward,10,1,1,1,1109,4841,4 -1305,alias_default_286,call_function,alias.default,forward,10,1,1,2,1110,4840,4 -1306,pow_21,call_function,pow.Tensor_Scalar,forward,10,1,1,1,1111,4839,4 -1307,mean_20,call_function,mean.dim,forward,10,1,1,1,1112,4838,4 -1308,add_50,call_function,add.Scalar,forward,10,1,1,1,1113,4837,3 -1309,rsqrt_20,call_function,rsqrt.default,forward,10,1,1,1,1114,4836,3 -1310,alias_default_287,call_function,alias.default,forward,10,1,1,3,1115,4835,3 -1311,mul_70,call_function,mul.Tensor,forward,10,2,2,1,1116,4831,8 -1312,alias_default_285,call_function,alias.default,forward,10,1,1,2,2,4832,2 -1313,mul_71,call_function,mul.Tensor,forward,10,2,2,1,1120,4830,8 -1314,convert_element_type_241,call_function,convert_element_type.default,forward,10,1,1,1,1121,4829,6 -1315,dtype_cast_92,call_function,dtype_cast.default,forward,10,1,1,1,1,4816,3 -1316,permute_110,call_function,permute.default,forward,10,1,1,1,2,4815,3 -1317,alias_default_288,call_function,alias.default,forward,10,1,1,6,1122,4828,4 -1318,alias_default_289,call_function,alias.default,forward,10,1,1,2,3,4814,3 -1319,einsum_default_70,call_function,einsum.default,forward,10,2,2,1,1127,4812,5 -1320,dtype_cast_93,call_function,dtype_cast.default,forward,10,1,1,1,1,4816,3 -1321,permute_111,call_function,permute.default,forward,10,1,1,1,2,4815,3 -1322,alias_default_290,call_function,alias.default,forward,10,1,1,2,3,4814,3 -1323,einsum_default_71,call_function,einsum.default,forward,10,2,2,1,1127,4812,5 -1324,dtype_cast_94,call_function,dtype_cast.default,forward,10,1,1,1,1,4809,3 -1325,permute_112,call_function,permute.default,forward,10,1,1,1,2,4808,3 -1326,alias_default_291,call_function,alias.default,forward,10,1,1,2,3,4807,3 -1327,einsum_default_72,call_function,einsum.default,forward,10,2,2,1,1127,4805,5 
-1328,view_236,call_function,view.default,forward,10,1,1,1,1128,4811,4 -1329,view_237,call_function,view.default,forward,10,1,1,1,1128,4811,4 -1330,view_238,call_function,view.default,forward,10,1,1,1,1128,4804,4 -1331,convert_element_type_248,call_function,convert_element_type.default,forward,10,1,1,1,1129,4810,4 -1332,view_239,call_function,view.default,forward,10,1,1,1,1130,4809,4 -1333,view_as_complex_20,call_function,view_as_complex.default,forward,10,1,1,1,1131,4808,6 -1334,convert_element_type_249,call_function,convert_element_type.default,forward,10,1,1,1,1129,4810,4 -1335,view_240,call_function,view.default,forward,10,1,1,1,1130,4809,4 -1336,view_as_complex_21,call_function,view_as_complex.default,forward,10,1,1,1,1131,4808,6 -1337,view_241,call_function,view.default,forward,10,1,1,1,2,4819,3 -1338,alias_default_292,call_function,alias.default,forward,10,1,1,4,3,4818,3 -1339,mul_72,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8 -1340,view_as_real_20,call_function,view_as_real.default,forward,10,1,1,1,1135,4806,6 -1341,view_242,call_function,view.default,forward,10,1,1,1,1136,4805,6 -1342,mul_73,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8 -1343,view_as_real_21,call_function,view_as_real.default,forward,10,1,1,1,1135,4806,6 -1344,view_243,call_function,view.default,forward,10,1,1,1,1136,4805,6 -1345,convert_element_type_250,call_function,convert_element_type.default,forward,10,1,1,1,1137,4804,6 -1346,convert_element_type_251,call_function,convert_element_type.default,forward,10,1,1,1,1137,4804,6 -1347,permute_113,call_function,permute.default,forward,10,1,1,1,1138,4803,6 -1348,permute_114,call_function,permute.default,forward,10,1,1,1,1138,4803,6 -1349,permute_115,call_function,permute.default,forward,10,1,1,1,1129,4803,4 -1350,alias_default_293,call_function,alias.default,forward,10,1,1,2,1139,4802,4 -1351,alias_default_294,call_function,alias.default,forward,10,1,1,2,1139,4802,4 
-1352,alias_default_295,call_function,alias.default,forward,10,1,1,2,1130,4802,4 -1353,_scaled_dot_product_flash_attention_10,call_function,_scaled_dot_product_flash_attention.default,forward,10,3,3,4,1163,4801,2 -1354,getitem_90,call_function,getitem,forward,10,1,1,1,1164,4797,2 -1355,getitem_91,call_function,getitem,forward,10,1,1,1,1164,1164,2 -1356,getitem_96,call_function,getitem,forward,10,1,1,1,1164,1164,1 -1357,getitem_97,call_function,getitem,forward,10,1,1,1,1164,1164,1 -1358,alias_default_296,call_function,alias.default,forward,10,1,1,2,1165,4796,4 -1359,permute_116,call_function,permute.default,forward,10,1,1,1,1166,4795,4 -1360,view_244,call_function,view.default,forward,10,1,1,1,1167,4794,3 -1361,dtype_cast_95,call_function,dtype_cast.default,forward,10,1,1,1,1,4796,3 -1362,permute_117,call_function,permute.default,forward,10,1,1,1,2,4795,3 -1363,alias_default_297,call_function,alias.default,forward,10,1,1,2,1168,4793,4 -1364,alias_default_298,call_function,alias.default,forward,10,1,1,2,3,4794,3 -1365,einsum_default_73,call_function,einsum.default,forward,10,2,2,1,1173,4792,5 -1366,add_51,call_function,add.Tensor,forward,10,2,2,1,1174,4791,10 -1367,dtype_cast_96,call_function,dtype_cast.default,forward,10,1,1,1,1,4780,2 -1368,alias_default_299,call_function,alias.default,forward,10,1,1,3,1175,4790,4 -1369,convert_element_type_254,call_function,convert_element_type.default,forward,10,1,1,1,1176,4788,4 -1370,alias_default_301,call_function,alias.default,forward,10,1,1,2,1177,4787,4 -1371,pow_22,call_function,pow.Tensor_Scalar,forward,10,1,1,1,1178,4786,4 -1372,mean_21,call_function,mean.dim,forward,10,1,1,1,1179,4785,4 -1373,add_52,call_function,add.Scalar,forward,10,1,1,1,1180,4784,3 -1374,rsqrt_21,call_function,rsqrt.default,forward,10,1,1,1,1181,4783,3 -1375,alias_default_302,call_function,alias.default,forward,10,1,1,3,1182,4782,3 -1376,mul_74,call_function,mul.Tensor,forward,10,2,2,1,1183,4778,8 
-1377,alias_default_300,call_function,alias.default,forward,10,1,1,2,2,4779,2 -1378,mul_75,call_function,mul.Tensor,forward,10,2,2,1,1187,4777,8 -1379,convert_element_type_255,call_function,convert_element_type.default,forward,10,1,1,1,1188,4776,6 -1380,dtype_cast_97,call_function,dtype_cast.default,forward,10,1,1,1,1,4776,3 -1381,permute_118,call_function,permute.default,forward,10,1,1,1,2,4775,3 -1382,alias_default_303,call_function,alias.default,forward,10,1,1,4,1189,4775,4 -1383,alias_default_304,call_function,alias.default,forward,10,1,1,2,3,4774,3 -1384,einsum_default_74,call_function,einsum.default,forward,10,2,2,1,1194,4772,5 -1385,alias_default_305,call_function,alias.default,forward,10,1,1,2,1195,4771,4 -1386,convert_element_type_258,call_function,convert_element_type.default,forward,10,1,1,1,1196,4759,4 -1387,alias_default_306,call_function,alias.default,forward,10,1,1,2,1197,4758,4 -1388,neg_10,call_function,neg.default,forward,10,1,1,1,1198,4757,8 -1389,exp_10,call_function,exp.default,forward,10,1,1,1,1199,4756,6 -1390,add_53,call_function,add.Tensor,forward,10,1,1,1,1200,4755,4 -1391,div_10,call_function,div.Tensor,forward,10,2,2,1,1201,4754,6 -1392,convert_element_type_259,call_function,convert_element_type.default,forward,10,1,1,1,1202,4753,6 -1393,dtype_cast_98,call_function,dtype_cast.default,forward,10,1,1,1,1,4757,3 -1394,permute_119,call_function,permute.default,forward,10,1,1,1,2,4756,3 -1395,alias_default_308,call_function,alias.default,forward,10,1,1,2,3,4755,3 -1396,einsum_default_75,call_function,einsum.default,forward,10,2,2,1,1194,4753,5 -1397,alias_default_307,call_function,alias.default,forward,10,1,1,2,1203,4752,4 -1398,alias_default_309,call_function,alias.default,forward,10,1,1,2,1195,4752,4 -1399,mul_76,call_function,mul.Tensor,forward,10,2,2,1,1210,4751,8 -1400,dtype_cast_99,call_function,dtype_cast.default,forward,10,1,1,1,1,4753,3 -1401,permute_120,call_function,permute.default,forward,10,1,1,1,2,4752,3 
-1402,alias_default_310,call_function,alias.default,forward,10,1,1,2,1211,4750,4 -1403,alias_default_311,call_function,alias.default,forward,10,1,1,2,3,4751,3 -1404,einsum_default_76,call_function,einsum.default,forward,10,2,2,1,1216,4749,5 -1405,add_54,call_function,add.Tensor,forward,10,2,2,1,1217,4748,10 -1406,dtype_cast_100,call_function,dtype_cast.default,forward,11,1,1,1,1,4737,2 -1407,alias_default_312,call_function,alias.default,forward,10,1,1,3,1218,4747,4 -1408,convert_element_type_264,call_function,convert_element_type.default,forward,11,1,1,1,1219,4745,4 -1409,alias_default_314,call_function,alias.default,forward,11,1,1,2,1220,4744,4 -1410,pow_23,call_function,pow.Tensor_Scalar,forward,11,1,1,1,1221,4743,4 -1411,mean_22,call_function,mean.dim,forward,11,1,1,1,1222,4742,4 -1412,add_55,call_function,add.Scalar,forward,11,1,1,1,1223,4741,3 -1413,rsqrt_22,call_function,rsqrt.default,forward,11,1,1,1,1224,4740,3 -1414,alias_default_315,call_function,alias.default,forward,11,1,1,3,1225,4739,3 -1415,mul_77,call_function,mul.Tensor,forward,11,2,2,1,1226,4735,8 -1416,alias_default_313,call_function,alias.default,forward,11,1,1,2,2,4736,2 -1417,mul_78,call_function,mul.Tensor,forward,11,2,2,1,1230,4734,8 -1418,convert_element_type_265,call_function,convert_element_type.default,forward,11,1,1,1,1231,4733,6 -1419,dtype_cast_101,call_function,dtype_cast.default,forward,11,1,1,1,1,4720,3 -1420,permute_121,call_function,permute.default,forward,11,1,1,1,2,4719,3 -1421,alias_default_316,call_function,alias.default,forward,11,1,1,6,1232,4732,4 -1422,alias_default_317,call_function,alias.default,forward,11,1,1,2,3,4718,3 -1423,einsum_default_77,call_function,einsum.default,forward,11,2,2,1,1237,4716,5 -1424,dtype_cast_102,call_function,dtype_cast.default,forward,11,1,1,1,1,4720,3 -1425,permute_122,call_function,permute.default,forward,11,1,1,1,2,4719,3 -1426,alias_default_318,call_function,alias.default,forward,11,1,1,2,3,4718,3 
-1427,einsum_default_78,call_function,einsum.default,forward,11,2,2,1,1237,4716,5 -1428,dtype_cast_103,call_function,dtype_cast.default,forward,11,1,1,1,1,4713,3 -1429,permute_123,call_function,permute.default,forward,11,1,1,1,2,4712,3 -1430,alias_default_319,call_function,alias.default,forward,11,1,1,2,3,4711,3 -1431,einsum_default_79,call_function,einsum.default,forward,11,2,2,1,1237,4709,5 -1432,view_259,call_function,view.default,forward,11,1,1,1,1238,4715,4 -1433,view_260,call_function,view.default,forward,11,1,1,1,1238,4715,4 -1434,view_261,call_function,view.default,forward,11,1,1,1,1238,4708,4 -1435,convert_element_type_272,call_function,convert_element_type.default,forward,11,1,1,1,1239,4714,4 -1436,view_262,call_function,view.default,forward,11,1,1,1,1240,4713,4 -1437,view_as_complex_22,call_function,view_as_complex.default,forward,11,1,1,1,1241,4712,6 -1438,convert_element_type_273,call_function,convert_element_type.default,forward,11,1,1,1,1239,4714,4 -1439,view_263,call_function,view.default,forward,11,1,1,1,1240,4713,4 -1440,view_as_complex_23,call_function,view_as_complex.default,forward,11,1,1,1,1241,4712,6 -1441,view_264,call_function,view.default,forward,11,1,1,1,2,4723,3 -1442,alias_default_320,call_function,alias.default,forward,11,1,1,4,3,4722,3 -1443,mul_79,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8 -1444,view_as_real_22,call_function,view_as_real.default,forward,11,1,1,1,1245,4710,6 -1445,view_265,call_function,view.default,forward,11,1,1,1,1246,4709,6 -1446,mul_80,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8 -1447,view_as_real_23,call_function,view_as_real.default,forward,11,1,1,1,1245,4710,6 -1448,view_266,call_function,view.default,forward,11,1,1,1,1246,4709,6 -1449,convert_element_type_274,call_function,convert_element_type.default,forward,11,1,1,1,1247,4708,6 -1450,convert_element_type_275,call_function,convert_element_type.default,forward,11,1,1,1,1247,4708,6 
-1451,permute_124,call_function,permute.default,forward,11,1,1,1,1248,4707,6 -1452,permute_125,call_function,permute.default,forward,11,1,1,1,1248,4707,6 -1453,permute_126,call_function,permute.default,forward,11,1,1,1,1239,4707,4 -1454,alias_default_321,call_function,alias.default,forward,11,1,1,2,1249,4706,4 -1455,alias_default_322,call_function,alias.default,forward,11,1,1,2,1249,4706,4 -1456,alias_default_323,call_function,alias.default,forward,11,1,1,2,1240,4706,4 -1457,_scaled_dot_product_flash_attention_11,call_function,_scaled_dot_product_flash_attention.default,forward,11,3,3,4,1273,4705,2 -1458,getitem_99,call_function,getitem,forward,11,1,1,1,1274,4701,2 -1459,getitem_100,call_function,getitem,forward,11,1,1,1,1274,1274,2 -1460,getitem_105,call_function,getitem,forward,11,1,1,1,1274,1274,1 -1461,getitem_106,call_function,getitem,forward,11,1,1,1,1274,1274,1 -1462,alias_default_324,call_function,alias.default,forward,11,1,1,2,1275,4700,4 -1463,permute_127,call_function,permute.default,forward,11,1,1,1,1276,4699,4 -1464,view_267,call_function,view.default,forward,11,1,1,1,1277,4698,3 -1465,dtype_cast_104,call_function,dtype_cast.default,forward,11,1,1,1,1,4700,3 -1466,permute_128,call_function,permute.default,forward,11,1,1,1,2,4699,3 -1467,alias_default_325,call_function,alias.default,forward,11,1,1,2,1278,4697,4 -1468,alias_default_326,call_function,alias.default,forward,11,1,1,2,3,4698,3 -1469,einsum_default_80,call_function,einsum.default,forward,11,2,2,1,1283,4696,5 -1470,add_56,call_function,add.Tensor,forward,11,2,2,1,1284,4695,10 -1471,dtype_cast_105,call_function,dtype_cast.default,forward,11,1,1,1,1,4684,2 -1472,alias_default_327,call_function,alias.default,forward,11,1,1,3,1285,4694,4 -1473,convert_element_type_278,call_function,convert_element_type.default,forward,11,1,1,1,1286,4692,4 -1474,alias_default_329,call_function,alias.default,forward,11,1,1,2,1287,4691,4 -1475,pow_24,call_function,pow.Tensor_Scalar,forward,11,1,1,1,1288,4690,4 
-1476,mean_23,call_function,mean.dim,forward,11,1,1,1,1289,4689,4 -1477,add_57,call_function,add.Scalar,forward,11,1,1,1,1290,4688,3 -1478,rsqrt_23,call_function,rsqrt.default,forward,11,1,1,1,1291,4687,3 -1479,alias_default_330,call_function,alias.default,forward,11,1,1,3,1292,4686,3 -1480,mul_81,call_function,mul.Tensor,forward,11,2,2,1,1293,4682,8 -1481,alias_default_328,call_function,alias.default,forward,11,1,1,2,2,4683,2 -1482,mul_82,call_function,mul.Tensor,forward,11,2,2,1,1297,4681,8 -1483,convert_element_type_279,call_function,convert_element_type.default,forward,11,1,1,1,1298,4680,6 -1484,dtype_cast_106,call_function,dtype_cast.default,forward,11,1,1,1,1,4680,3 -1485,permute_129,call_function,permute.default,forward,11,1,1,1,2,4679,3 -1486,alias_default_331,call_function,alias.default,forward,11,1,1,4,1299,4679,4 -1487,alias_default_332,call_function,alias.default,forward,11,1,1,2,3,4678,3 -1488,einsum_default_81,call_function,einsum.default,forward,11,2,2,1,1304,4676,5 -1489,alias_default_333,call_function,alias.default,forward,11,1,1,2,1305,4675,4 -1490,convert_element_type_282,call_function,convert_element_type.default,forward,11,1,1,1,1306,4663,4 -1491,alias_default_334,call_function,alias.default,forward,11,1,1,2,1307,4662,4 -1492,neg_11,call_function,neg.default,forward,11,1,1,1,1308,4661,8 -1493,exp_11,call_function,exp.default,forward,11,1,1,1,1309,4660,6 -1494,add_58,call_function,add.Tensor,forward,11,1,1,1,1310,4659,4 -1495,div_11,call_function,div.Tensor,forward,11,2,2,1,1311,4658,6 -1496,convert_element_type_283,call_function,convert_element_type.default,forward,11,1,1,1,1312,4657,6 -1497,dtype_cast_107,call_function,dtype_cast.default,forward,11,1,1,1,1,4661,3 -1498,permute_130,call_function,permute.default,forward,11,1,1,1,2,4660,3 -1499,alias_default_336,call_function,alias.default,forward,11,1,1,2,3,4659,3 -1500,einsum_default_82,call_function,einsum.default,forward,11,2,2,1,1304,4657,5 
-1501,alias_default_335,call_function,alias.default,forward,11,1,1,2,1313,4656,4 -1502,alias_default_337,call_function,alias.default,forward,11,1,1,2,1305,4656,4 -1503,mul_83,call_function,mul.Tensor,forward,11,2,2,1,1320,4655,8 -1504,dtype_cast_108,call_function,dtype_cast.default,forward,11,1,1,1,1,4657,3 -1505,permute_131,call_function,permute.default,forward,11,1,1,1,2,4656,3 -1506,alias_default_338,call_function,alias.default,forward,11,1,1,2,1321,4654,4 -1507,alias_default_339,call_function,alias.default,forward,11,1,1,2,3,4655,3 -1508,einsum_default_83,call_function,einsum.default,forward,11,2,2,1,1326,4653,5 -1509,add_59,call_function,add.Tensor,forward,11,2,2,1,1327,4652,10 -1510,dtype_cast_109,call_function,dtype_cast.default,forward,12,1,1,1,1,4641,2 -1511,alias_default_340,call_function,alias.default,forward,11,1,1,3,1328,4651,4 -1512,convert_element_type_288,call_function,convert_element_type.default,forward,12,1,1,1,1329,4649,4 -1513,alias_default_342,call_function,alias.default,forward,12,1,1,2,1330,4648,4 -1514,pow_25,call_function,pow.Tensor_Scalar,forward,12,1,1,1,1331,4647,4 -1515,mean_24,call_function,mean.dim,forward,12,1,1,1,1332,4646,4 -1516,add_60,call_function,add.Scalar,forward,12,1,1,1,1333,4645,3 -1517,rsqrt_24,call_function,rsqrt.default,forward,12,1,1,1,1334,4644,3 -1518,alias_default_343,call_function,alias.default,forward,12,1,1,3,1335,4643,3 -1519,mul_84,call_function,mul.Tensor,forward,12,2,2,1,1336,4639,8 -1520,alias_default_341,call_function,alias.default,forward,12,1,1,2,2,4640,2 -1521,mul_85,call_function,mul.Tensor,forward,12,2,2,1,1340,4638,8 -1522,convert_element_type_289,call_function,convert_element_type.default,forward,12,1,1,1,1341,4637,6 -1523,dtype_cast_110,call_function,dtype_cast.default,forward,12,1,1,1,1,4624,3 -1524,permute_132,call_function,permute.default,forward,12,1,1,1,2,4623,3 -1525,alias_default_344,call_function,alias.default,forward,12,1,1,6,1342,4636,4 
-1526,alias_default_345,call_function,alias.default,forward,12,1,1,2,3,4622,3 -1527,einsum_default_84,call_function,einsum.default,forward,12,2,2,1,1347,4620,5 -1528,dtype_cast_111,call_function,dtype_cast.default,forward,12,1,1,1,1,4624,3 -1529,permute_133,call_function,permute.default,forward,12,1,1,1,2,4623,3 -1530,alias_default_346,call_function,alias.default,forward,12,1,1,2,3,4622,3 -1531,einsum_default_85,call_function,einsum.default,forward,12,2,2,1,1347,4620,5 -1532,dtype_cast_112,call_function,dtype_cast.default,forward,12,1,1,1,1,4617,3 -1533,permute_134,call_function,permute.default,forward,12,1,1,1,2,4616,3 -1534,alias_default_347,call_function,alias.default,forward,12,1,1,2,3,4615,3 -1535,einsum_default_86,call_function,einsum.default,forward,12,2,2,1,1347,4613,5 -1536,view_282,call_function,view.default,forward,12,1,1,1,1348,4619,4 -1537,view_283,call_function,view.default,forward,12,1,1,1,1348,4619,4 -1538,view_284,call_function,view.default,forward,12,1,1,1,1348,4612,4 -1539,convert_element_type_296,call_function,convert_element_type.default,forward,12,1,1,1,1349,4618,4 -1540,view_285,call_function,view.default,forward,12,1,1,1,1350,4617,4 -1541,view_as_complex_24,call_function,view_as_complex.default,forward,12,1,1,1,1351,4616,6 -1542,convert_element_type_297,call_function,convert_element_type.default,forward,12,1,1,1,1349,4618,4 -1543,view_286,call_function,view.default,forward,12,1,1,1,1350,4617,4 -1544,view_as_complex_25,call_function,view_as_complex.default,forward,12,1,1,1,1351,4616,6 -1545,view_287,call_function,view.default,forward,12,1,1,1,2,4627,3 -1546,alias_default_348,call_function,alias.default,forward,12,1,1,4,3,4626,3 -1547,mul_86,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8 -1548,view_as_real_24,call_function,view_as_real.default,forward,12,1,1,1,1355,4614,6 -1549,view_288,call_function,view.default,forward,12,1,1,1,1356,4613,6 -1550,mul_87,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8 
-1551,view_as_real_25,call_function,view_as_real.default,forward,12,1,1,1,1355,4614,6 -1552,view_289,call_function,view.default,forward,12,1,1,1,1356,4613,6 -1553,convert_element_type_298,call_function,convert_element_type.default,forward,12,1,1,1,1357,4612,6 -1554,convert_element_type_299,call_function,convert_element_type.default,forward,12,1,1,1,1357,4612,6 -1555,permute_135,call_function,permute.default,forward,12,1,1,1,1358,4611,6 -1556,permute_136,call_function,permute.default,forward,12,1,1,1,1358,4611,6 -1557,permute_137,call_function,permute.default,forward,12,1,1,1,1349,4611,4 -1558,alias_default_349,call_function,alias.default,forward,12,1,1,2,1359,4610,4 -1559,alias_default_350,call_function,alias.default,forward,12,1,1,2,1359,4610,4 -1560,alias_default_351,call_function,alias.default,forward,12,1,1,2,1350,4610,4 -1561,_scaled_dot_product_flash_attention_12,call_function,_scaled_dot_product_flash_attention.default,forward,12,3,3,4,1383,4609,2 -1562,getitem_108,call_function,getitem,forward,12,1,1,1,1384,4605,2 -1563,getitem_109,call_function,getitem,forward,12,1,1,1,1384,1384,2 -1564,getitem_114,call_function,getitem,forward,12,1,1,1,1384,1384,1 -1565,getitem_115,call_function,getitem,forward,12,1,1,1,1384,1384,1 -1566,alias_default_352,call_function,alias.default,forward,12,1,1,2,1385,4604,4 -1567,permute_138,call_function,permute.default,forward,12,1,1,1,1386,4603,4 -1568,view_290,call_function,view.default,forward,12,1,1,1,1387,4602,3 -1569,dtype_cast_113,call_function,dtype_cast.default,forward,12,1,1,1,1,4604,3 -1570,permute_139,call_function,permute.default,forward,12,1,1,1,2,4603,3 -1571,alias_default_353,call_function,alias.default,forward,12,1,1,2,1388,4601,4 -1572,alias_default_354,call_function,alias.default,forward,12,1,1,2,3,4602,3 -1573,einsum_default_87,call_function,einsum.default,forward,12,2,2,1,1393,4600,5 -1574,add_61,call_function,add.Tensor,forward,12,2,2,1,1394,4599,10 
-1575,dtype_cast_114,call_function,dtype_cast.default,forward,12,1,1,1,1,4588,2 -1576,alias_default_355,call_function,alias.default,forward,12,1,1,3,1395,4598,4 -1577,convert_element_type_302,call_function,convert_element_type.default,forward,12,1,1,1,1396,4596,4 -1578,alias_default_357,call_function,alias.default,forward,12,1,1,2,1397,4595,4 -1579,pow_26,call_function,pow.Tensor_Scalar,forward,12,1,1,1,1398,4594,4 -1580,mean_25,call_function,mean.dim,forward,12,1,1,1,1399,4593,4 -1581,add_62,call_function,add.Scalar,forward,12,1,1,1,1400,4592,3 -1582,rsqrt_25,call_function,rsqrt.default,forward,12,1,1,1,1401,4591,3 -1583,alias_default_358,call_function,alias.default,forward,12,1,1,3,1402,4590,3 -1584,mul_88,call_function,mul.Tensor,forward,12,2,2,1,1403,4586,8 -1585,alias_default_356,call_function,alias.default,forward,12,1,1,2,2,4587,2 -1586,mul_89,call_function,mul.Tensor,forward,12,2,2,1,1407,4585,8 -1587,convert_element_type_303,call_function,convert_element_type.default,forward,12,1,1,1,1408,4584,6 -1588,dtype_cast_115,call_function,dtype_cast.default,forward,12,1,1,1,1,4584,3 -1589,permute_140,call_function,permute.default,forward,12,1,1,1,2,4583,3 -1590,alias_default_359,call_function,alias.default,forward,12,1,1,4,1409,4583,4 -1591,alias_default_360,call_function,alias.default,forward,12,1,1,2,3,4582,3 -1592,einsum_default_88,call_function,einsum.default,forward,12,2,2,1,1414,4580,5 -1593,alias_default_361,call_function,alias.default,forward,12,1,1,2,1415,4579,4 -1594,convert_element_type_306,call_function,convert_element_type.default,forward,12,1,1,1,1416,4567,4 -1595,alias_default_362,call_function,alias.default,forward,12,1,1,2,1417,4566,4 -1596,neg_12,call_function,neg.default,forward,12,1,1,1,1418,4565,8 -1597,exp_12,call_function,exp.default,forward,12,1,1,1,1419,4564,6 -1598,add_63,call_function,add.Tensor,forward,12,1,1,1,1420,4563,4 -1599,div_12,call_function,div.Tensor,forward,12,2,2,1,1421,4562,6 
-1600,convert_element_type_307,call_function,convert_element_type.default,forward,12,1,1,1,1422,4561,6 -1601,dtype_cast_116,call_function,dtype_cast.default,forward,12,1,1,1,1,4565,3 -1602,permute_141,call_function,permute.default,forward,12,1,1,1,2,4564,3 -1603,alias_default_364,call_function,alias.default,forward,12,1,1,2,3,4563,3 -1604,einsum_default_89,call_function,einsum.default,forward,12,2,2,1,1414,4561,5 -1605,alias_default_363,call_function,alias.default,forward,12,1,1,2,1423,4560,4 -1606,alias_default_365,call_function,alias.default,forward,12,1,1,2,1415,4560,4 -1607,mul_90,call_function,mul.Tensor,forward,12,2,2,1,1430,4559,8 -1608,dtype_cast_117,call_function,dtype_cast.default,forward,12,1,1,1,1,4561,3 -1609,permute_142,call_function,permute.default,forward,12,1,1,1,2,4560,3 -1610,alias_default_366,call_function,alias.default,forward,12,1,1,2,1431,4558,4 -1611,alias_default_367,call_function,alias.default,forward,12,1,1,2,3,4559,3 -1612,einsum_default_90,call_function,einsum.default,forward,12,2,2,1,1436,4557,5 -1613,add_64,call_function,add.Tensor,forward,12,2,2,1,1437,4556,10 -1614,dtype_cast_118,call_function,dtype_cast.default,forward,13,1,1,1,1,4545,2 -1615,alias_default_368,call_function,alias.default,forward,12,1,1,3,1438,4555,4 -1616,convert_element_type_312,call_function,convert_element_type.default,forward,13,1,1,1,1439,4553,4 -1617,alias_default_370,call_function,alias.default,forward,13,1,1,2,1440,4552,4 -1618,pow_27,call_function,pow.Tensor_Scalar,forward,13,1,1,1,1441,4551,4 -1619,mean_26,call_function,mean.dim,forward,13,1,1,1,1442,4550,4 -1620,add_65,call_function,add.Scalar,forward,13,1,1,1,1443,4549,3 -1621,rsqrt_26,call_function,rsqrt.default,forward,13,1,1,1,1444,4548,3 -1622,alias_default_371,call_function,alias.default,forward,13,1,1,3,1445,4547,3 -1623,mul_91,call_function,mul.Tensor,forward,13,2,2,1,1446,4543,8 -1624,alias_default_369,call_function,alias.default,forward,13,1,1,2,2,4544,2 
-1625,mul_92,call_function,mul.Tensor,forward,13,2,2,1,1450,4542,8 -1626,convert_element_type_313,call_function,convert_element_type.default,forward,13,1,1,1,1451,4541,6 -1627,dtype_cast_119,call_function,dtype_cast.default,forward,13,1,1,1,1,4528,3 -1628,permute_143,call_function,permute.default,forward,13,1,1,1,2,4527,3 -1629,alias_default_372,call_function,alias.default,forward,13,1,1,6,1452,4540,4 -1630,alias_default_373,call_function,alias.default,forward,13,1,1,2,3,4526,3 -1631,einsum_default_91,call_function,einsum.default,forward,13,2,2,1,1457,4524,5 -1632,dtype_cast_120,call_function,dtype_cast.default,forward,13,1,1,1,1,4528,3 -1633,permute_144,call_function,permute.default,forward,13,1,1,1,2,4527,3 -1634,alias_default_374,call_function,alias.default,forward,13,1,1,2,3,4526,3 -1635,einsum_default_92,call_function,einsum.default,forward,13,2,2,1,1457,4524,5 -1636,dtype_cast_121,call_function,dtype_cast.default,forward,13,1,1,1,1,4521,3 -1637,permute_145,call_function,permute.default,forward,13,1,1,1,2,4520,3 -1638,alias_default_375,call_function,alias.default,forward,13,1,1,2,3,4519,3 -1639,einsum_default_93,call_function,einsum.default,forward,13,2,2,1,1457,4517,5 -1640,view_305,call_function,view.default,forward,13,1,1,1,1458,4523,4 -1641,view_306,call_function,view.default,forward,13,1,1,1,1458,4523,4 -1642,view_307,call_function,view.default,forward,13,1,1,1,1458,4516,4 -1643,convert_element_type_320,call_function,convert_element_type.default,forward,13,1,1,1,1459,4522,4 -1644,view_308,call_function,view.default,forward,13,1,1,1,1460,4521,4 -1645,view_as_complex_26,call_function,view_as_complex.default,forward,13,1,1,1,1461,4520,6 -1646,convert_element_type_321,call_function,convert_element_type.default,forward,13,1,1,1,1459,4522,4 -1647,view_309,call_function,view.default,forward,13,1,1,1,1460,4521,4 -1648,view_as_complex_27,call_function,view_as_complex.default,forward,13,1,1,1,1461,4520,6 
-1649,view_310,call_function,view.default,forward,13,1,1,1,2,4531,3 -1650,alias_default_376,call_function,alias.default,forward,13,1,1,4,3,4530,3 -1651,mul_93,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8 -1652,view_as_real_26,call_function,view_as_real.default,forward,13,1,1,1,1465,4518,6 -1653,view_311,call_function,view.default,forward,13,1,1,1,1466,4517,6 -1654,mul_94,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8 -1655,view_as_real_27,call_function,view_as_real.default,forward,13,1,1,1,1465,4518,6 -1656,view_312,call_function,view.default,forward,13,1,1,1,1466,4517,6 -1657,convert_element_type_322,call_function,convert_element_type.default,forward,13,1,1,1,1467,4516,6 -1658,convert_element_type_323,call_function,convert_element_type.default,forward,13,1,1,1,1467,4516,6 -1659,permute_146,call_function,permute.default,forward,13,1,1,1,1468,4515,6 -1660,permute_147,call_function,permute.default,forward,13,1,1,1,1468,4515,6 -1661,permute_148,call_function,permute.default,forward,13,1,1,1,1459,4515,4 -1662,alias_default_377,call_function,alias.default,forward,13,1,1,2,1469,4514,4 -1663,alias_default_378,call_function,alias.default,forward,13,1,1,2,1469,4514,4 -1664,alias_default_379,call_function,alias.default,forward,13,1,1,2,1460,4514,4 -1665,_scaled_dot_product_flash_attention_13,call_function,_scaled_dot_product_flash_attention.default,forward,13,3,3,4,1493,4513,2 -1666,getitem_117,call_function,getitem,forward,13,1,1,1,1494,4509,2 -1667,getitem_118,call_function,getitem,forward,13,1,1,1,1494,1494,2 -1668,getitem_123,call_function,getitem,forward,13,1,1,1,1494,1494,1 -1669,getitem_124,call_function,getitem,forward,13,1,1,1,1494,1494,1 -1670,alias_default_380,call_function,alias.default,forward,13,1,1,2,1495,4508,4 -1671,permute_149,call_function,permute.default,forward,13,1,1,1,1496,4507,4 -1672,view_313,call_function,view.default,forward,13,1,1,1,1497,4506,3 -1673,dtype_cast_122,call_function,dtype_cast.default,forward,13,1,1,1,1,4508,3 
-1674,permute_150,call_function,permute.default,forward,13,1,1,1,2,4507,3 -1675,alias_default_381,call_function,alias.default,forward,13,1,1,2,1498,4505,4 -1676,alias_default_382,call_function,alias.default,forward,13,1,1,2,3,4506,3 -1677,einsum_default_94,call_function,einsum.default,forward,13,2,2,1,1503,4504,5 -1678,add_66,call_function,add.Tensor,forward,13,2,2,1,1504,4503,10 -1679,dtype_cast_123,call_function,dtype_cast.default,forward,13,1,1,1,1,4492,2 -1680,alias_default_383,call_function,alias.default,forward,13,1,1,3,1505,4502,4 -1681,convert_element_type_326,call_function,convert_element_type.default,forward,13,1,1,1,1506,4500,4 -1682,alias_default_385,call_function,alias.default,forward,13,1,1,2,1507,4499,4 -1683,pow_28,call_function,pow.Tensor_Scalar,forward,13,1,1,1,1508,4498,4 -1684,mean_27,call_function,mean.dim,forward,13,1,1,1,1509,4497,4 -1685,add_67,call_function,add.Scalar,forward,13,1,1,1,1510,4496,3 -1686,rsqrt_27,call_function,rsqrt.default,forward,13,1,1,1,1511,4495,3 -1687,alias_default_386,call_function,alias.default,forward,13,1,1,3,1512,4494,3 -1688,mul_95,call_function,mul.Tensor,forward,13,2,2,1,1513,4490,8 -1689,alias_default_384,call_function,alias.default,forward,13,1,1,2,2,4491,2 -1690,mul_96,call_function,mul.Tensor,forward,13,2,2,1,1517,4489,8 -1691,convert_element_type_327,call_function,convert_element_type.default,forward,13,1,1,1,1518,4488,6 -1692,dtype_cast_124,call_function,dtype_cast.default,forward,13,1,1,1,1,4488,3 -1693,permute_151,call_function,permute.default,forward,13,1,1,1,2,4487,3 -1694,alias_default_387,call_function,alias.default,forward,13,1,1,4,1519,4487,4 -1695,alias_default_388,call_function,alias.default,forward,13,1,1,2,3,4486,3 -1696,einsum_default_95,call_function,einsum.default,forward,13,2,2,1,1524,4484,5 -1697,alias_default_389,call_function,alias.default,forward,13,1,1,2,1525,4483,4 -1698,convert_element_type_330,call_function,convert_element_type.default,forward,13,1,1,1,1526,4471,4 
-1699,alias_default_390,call_function,alias.default,forward,13,1,1,2,1527,4470,4 -1700,neg_13,call_function,neg.default,forward,13,1,1,1,1528,4469,8 -1701,exp_13,call_function,exp.default,forward,13,1,1,1,1529,4468,6 -1702,add_68,call_function,add.Tensor,forward,13,1,1,1,1530,4467,4 -1703,div_13,call_function,div.Tensor,forward,13,2,2,1,1531,4466,6 -1704,convert_element_type_331,call_function,convert_element_type.default,forward,13,1,1,1,1532,4465,6 -1705,dtype_cast_125,call_function,dtype_cast.default,forward,13,1,1,1,1,4469,3 -1706,permute_152,call_function,permute.default,forward,13,1,1,1,2,4468,3 -1707,alias_default_392,call_function,alias.default,forward,13,1,1,2,3,4467,3 -1708,einsum_default_96,call_function,einsum.default,forward,13,2,2,1,1524,4465,5 -1709,alias_default_391,call_function,alias.default,forward,13,1,1,2,1533,4464,4 -1710,alias_default_393,call_function,alias.default,forward,13,1,1,2,1525,4464,4 -1711,mul_97,call_function,mul.Tensor,forward,13,2,2,1,1540,4463,8 -1712,dtype_cast_126,call_function,dtype_cast.default,forward,13,1,1,1,1,4465,3 -1713,permute_153,call_function,permute.default,forward,13,1,1,1,2,4464,3 -1714,alias_default_394,call_function,alias.default,forward,13,1,1,2,1541,4462,4 -1715,alias_default_395,call_function,alias.default,forward,13,1,1,2,3,4463,3 -1716,einsum_default_97,call_function,einsum.default,forward,13,2,2,1,1546,4461,5 -1717,add_69,call_function,add.Tensor,forward,13,2,2,1,1547,4460,10 -1718,dtype_cast_127,call_function,dtype_cast.default,forward,14,1,1,1,1,4449,2 -1719,alias_default_396,call_function,alias.default,forward,13,1,1,3,1548,4459,4 -1720,convert_element_type_336,call_function,convert_element_type.default,forward,14,1,1,1,1549,4457,4 -1721,alias_default_398,call_function,alias.default,forward,14,1,1,2,1550,4456,4 -1722,pow_29,call_function,pow.Tensor_Scalar,forward,14,1,1,1,1551,4455,4 -1723,mean_28,call_function,mean.dim,forward,14,1,1,1,1552,4454,4 
-1724,add_70,call_function,add.Scalar,forward,14,1,1,1,1553,4453,3 -1725,rsqrt_28,call_function,rsqrt.default,forward,14,1,1,1,1554,4452,3 -1726,alias_default_399,call_function,alias.default,forward,14,1,1,3,1555,4451,3 -1727,mul_98,call_function,mul.Tensor,forward,14,2,2,1,1556,4447,8 -1728,alias_default_397,call_function,alias.default,forward,14,1,1,2,2,4448,2 -1729,mul_99,call_function,mul.Tensor,forward,14,2,2,1,1560,4446,8 -1730,convert_element_type_337,call_function,convert_element_type.default,forward,14,1,1,1,1561,4445,6 -1731,dtype_cast_128,call_function,dtype_cast.default,forward,14,1,1,1,1,4432,3 -1732,permute_154,call_function,permute.default,forward,14,1,1,1,2,4431,3 -1733,alias_default_400,call_function,alias.default,forward,14,1,1,6,1562,4444,4 -1734,alias_default_401,call_function,alias.default,forward,14,1,1,2,3,4430,3 -1735,einsum_default_98,call_function,einsum.default,forward,14,2,2,1,1567,4428,5 -1736,dtype_cast_129,call_function,dtype_cast.default,forward,14,1,1,1,1,4432,3 -1737,permute_155,call_function,permute.default,forward,14,1,1,1,2,4431,3 -1738,alias_default_402,call_function,alias.default,forward,14,1,1,2,3,4430,3 -1739,einsum_default_99,call_function,einsum.default,forward,14,2,2,1,1567,4428,5 -1740,dtype_cast_130,call_function,dtype_cast.default,forward,14,1,1,1,1,4425,3 -1741,permute_156,call_function,permute.default,forward,14,1,1,1,2,4424,3 -1742,alias_default_403,call_function,alias.default,forward,14,1,1,2,3,4423,3 -1743,einsum_default_100,call_function,einsum.default,forward,14,2,2,1,1567,4421,5 -1744,view_328,call_function,view.default,forward,14,1,1,1,1568,4427,4 -1745,view_329,call_function,view.default,forward,14,1,1,1,1568,4427,4 -1746,view_330,call_function,view.default,forward,14,1,1,1,1568,4420,4 -1747,convert_element_type_344,call_function,convert_element_type.default,forward,14,1,1,1,1569,4426,4 -1748,view_331,call_function,view.default,forward,14,1,1,1,1570,4425,4 
-1749,view_as_complex_28,call_function,view_as_complex.default,forward,14,1,1,1,1571,4424,6 -1750,convert_element_type_345,call_function,convert_element_type.default,forward,14,1,1,1,1569,4426,4 -1751,view_332,call_function,view.default,forward,14,1,1,1,1570,4425,4 -1752,view_as_complex_29,call_function,view_as_complex.default,forward,14,1,1,1,1571,4424,6 -1753,view_333,call_function,view.default,forward,14,1,1,1,2,4435,3 -1754,alias_default_404,call_function,alias.default,forward,14,1,1,4,3,4434,3 -1755,mul_100,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8 -1756,view_as_real_28,call_function,view_as_real.default,forward,14,1,1,1,1575,4422,6 -1757,view_334,call_function,view.default,forward,14,1,1,1,1576,4421,6 -1758,mul_101,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8 -1759,view_as_real_29,call_function,view_as_real.default,forward,14,1,1,1,1575,4422,6 -1760,view_335,call_function,view.default,forward,14,1,1,1,1576,4421,6 -1761,convert_element_type_346,call_function,convert_element_type.default,forward,14,1,1,1,1577,4420,6 -1762,convert_element_type_347,call_function,convert_element_type.default,forward,14,1,1,1,1577,4420,6 -1763,permute_157,call_function,permute.default,forward,14,1,1,1,1578,4419,6 -1764,permute_158,call_function,permute.default,forward,14,1,1,1,1578,4419,6 -1765,permute_159,call_function,permute.default,forward,14,1,1,1,1569,4419,4 -1766,alias_default_405,call_function,alias.default,forward,14,1,1,2,1579,4418,4 -1767,alias_default_406,call_function,alias.default,forward,14,1,1,2,1579,4418,4 -1768,alias_default_407,call_function,alias.default,forward,14,1,1,2,1570,4418,4 -1769,_scaled_dot_product_flash_attention_14,call_function,_scaled_dot_product_flash_attention.default,forward,14,3,3,4,1603,4417,2 -1770,getitem_126,call_function,getitem,forward,14,1,1,1,1604,4413,2 -1771,getitem_127,call_function,getitem,forward,14,1,1,1,1604,1604,2 -1772,getitem_132,call_function,getitem,forward,14,1,1,1,1604,1604,1 
-1773,getitem_133,call_function,getitem,forward,14,1,1,1,1604,1604,1 -1774,alias_default_408,call_function,alias.default,forward,14,1,1,2,1605,4412,4 -1775,permute_160,call_function,permute.default,forward,14,1,1,1,1606,4411,4 -1776,view_336,call_function,view.default,forward,14,1,1,1,1607,4410,3 -1777,dtype_cast_131,call_function,dtype_cast.default,forward,14,1,1,1,1,4412,3 -1778,permute_161,call_function,permute.default,forward,14,1,1,1,2,4411,3 -1779,alias_default_409,call_function,alias.default,forward,14,1,1,2,1608,4409,4 -1780,alias_default_410,call_function,alias.default,forward,14,1,1,2,3,4410,3 -1781,einsum_default_101,call_function,einsum.default,forward,14,2,2,1,1613,4408,5 -1782,add_71,call_function,add.Tensor,forward,14,2,2,1,1614,4407,10 -1783,dtype_cast_132,call_function,dtype_cast.default,forward,14,1,1,1,1,4396,2 -1784,alias_default_411,call_function,alias.default,forward,14,1,1,3,1615,4406,4 -1785,convert_element_type_350,call_function,convert_element_type.default,forward,14,1,1,1,1616,4404,4 -1786,alias_default_413,call_function,alias.default,forward,14,1,1,2,1617,4403,4 -1787,pow_30,call_function,pow.Tensor_Scalar,forward,14,1,1,1,1618,4402,4 -1788,mean_29,call_function,mean.dim,forward,14,1,1,1,1619,4401,4 -1789,add_72,call_function,add.Scalar,forward,14,1,1,1,1620,4400,3 -1790,rsqrt_29,call_function,rsqrt.default,forward,14,1,1,1,1621,4399,3 -1791,alias_default_414,call_function,alias.default,forward,14,1,1,3,1622,4398,3 -1792,mul_102,call_function,mul.Tensor,forward,14,2,2,1,1623,4394,8 -1793,alias_default_412,call_function,alias.default,forward,14,1,1,2,2,4395,2 -1794,mul_103,call_function,mul.Tensor,forward,14,2,2,1,1627,4393,8 -1795,convert_element_type_351,call_function,convert_element_type.default,forward,14,1,1,1,1628,4392,6 -1796,dtype_cast_133,call_function,dtype_cast.default,forward,14,1,1,1,1,4392,3 -1797,permute_162,call_function,permute.default,forward,14,1,1,1,2,4391,3 
-1798,alias_default_415,call_function,alias.default,forward,14,1,1,4,1629,4391,4 -1799,alias_default_416,call_function,alias.default,forward,14,1,1,2,3,4390,3 -1800,einsum_default_102,call_function,einsum.default,forward,14,2,2,1,1634,4388,5 -1801,alias_default_417,call_function,alias.default,forward,14,1,1,2,1635,4387,4 -1802,convert_element_type_354,call_function,convert_element_type.default,forward,14,1,1,1,1636,4375,4 -1803,alias_default_418,call_function,alias.default,forward,14,1,1,2,1637,4374,4 -1804,neg_14,call_function,neg.default,forward,14,1,1,1,1638,4373,8 -1805,exp_14,call_function,exp.default,forward,14,1,1,1,1639,4372,6 -1806,add_73,call_function,add.Tensor,forward,14,1,1,1,1640,4371,4 -1807,div_14,call_function,div.Tensor,forward,14,2,2,1,1641,4370,6 -1808,convert_element_type_355,call_function,convert_element_type.default,forward,14,1,1,1,1642,4369,6 -1809,dtype_cast_134,call_function,dtype_cast.default,forward,14,1,1,1,1,4373,3 -1810,permute_163,call_function,permute.default,forward,14,1,1,1,2,4372,3 -1811,alias_default_420,call_function,alias.default,forward,14,1,1,2,3,4371,3 -1812,einsum_default_103,call_function,einsum.default,forward,14,2,2,1,1634,4369,5 -1813,alias_default_419,call_function,alias.default,forward,14,1,1,2,1643,4368,4 -1814,alias_default_421,call_function,alias.default,forward,14,1,1,2,1635,4368,4 -1815,mul_104,call_function,mul.Tensor,forward,14,2,2,1,1650,4367,8 -1816,dtype_cast_135,call_function,dtype_cast.default,forward,14,1,1,1,1,4369,3 -1817,permute_164,call_function,permute.default,forward,14,1,1,1,2,4368,3 -1818,alias_default_422,call_function,alias.default,forward,14,1,1,2,1651,4366,4 -1819,alias_default_423,call_function,alias.default,forward,14,1,1,2,3,4367,3 -1820,einsum_default_104,call_function,einsum.default,forward,14,2,2,1,1656,4365,5 -1821,add_74,call_function,add.Tensor,forward,14,2,2,1,1657,4364,10 -1822,dtype_cast_136,call_function,dtype_cast.default,forward,15,1,1,1,1,4353,2 
-1823,alias_default_424,call_function,alias.default,forward,14,1,1,3,1658,4363,4 -1824,convert_element_type_360,call_function,convert_element_type.default,forward,15,1,1,1,1659,4361,4 -1825,alias_default_426,call_function,alias.default,forward,15,1,1,2,1660,4360,4 -1826,pow_31,call_function,pow.Tensor_Scalar,forward,15,1,1,1,1661,4359,4 -1827,mean_30,call_function,mean.dim,forward,15,1,1,1,1662,4358,4 -1828,add_75,call_function,add.Scalar,forward,15,1,1,1,1663,4357,3 -1829,rsqrt_30,call_function,rsqrt.default,forward,15,1,1,1,1664,4356,3 -1830,alias_default_427,call_function,alias.default,forward,15,1,1,3,1665,4355,3 -1831,mul_105,call_function,mul.Tensor,forward,15,2,2,1,1666,4351,8 -1832,alias_default_425,call_function,alias.default,forward,15,1,1,2,2,4352,2 -1833,mul_106,call_function,mul.Tensor,forward,15,2,2,1,1670,4350,8 -1834,convert_element_type_361,call_function,convert_element_type.default,forward,15,1,1,1,1671,4349,6 -1835,dtype_cast_137,call_function,dtype_cast.default,forward,15,1,1,1,1,4336,3 -1836,permute_165,call_function,permute.default,forward,15,1,1,1,2,4335,3 -1837,alias_default_428,call_function,alias.default,forward,15,1,1,6,1672,4348,4 -1838,alias_default_429,call_function,alias.default,forward,15,1,1,2,3,4334,3 -1839,einsum_default_105,call_function,einsum.default,forward,15,2,2,1,1677,4332,5 -1840,dtype_cast_138,call_function,dtype_cast.default,forward,15,1,1,1,1,4336,3 -1841,permute_166,call_function,permute.default,forward,15,1,1,1,2,4335,3 -1842,alias_default_430,call_function,alias.default,forward,15,1,1,2,3,4334,3 -1843,einsum_default_106,call_function,einsum.default,forward,15,2,2,1,1677,4332,5 -1844,dtype_cast_139,call_function,dtype_cast.default,forward,15,1,1,1,1,4329,3 -1845,permute_167,call_function,permute.default,forward,15,1,1,1,2,4328,3 -1846,alias_default_431,call_function,alias.default,forward,15,1,1,2,3,4327,3 -1847,einsum_default_107,call_function,einsum.default,forward,15,2,2,1,1677,4325,5 
-1848,view_351,call_function,view.default,forward,15,1,1,1,1678,4331,4 -1849,view_352,call_function,view.default,forward,15,1,1,1,1678,4331,4 -1850,view_353,call_function,view.default,forward,15,1,1,1,1678,4324,4 -1851,convert_element_type_368,call_function,convert_element_type.default,forward,15,1,1,1,1679,4330,4 -1852,view_354,call_function,view.default,forward,15,1,1,1,1680,4329,4 -1853,view_as_complex_30,call_function,view_as_complex.default,forward,15,1,1,1,1681,4328,6 -1854,convert_element_type_369,call_function,convert_element_type.default,forward,15,1,1,1,1679,4330,4 -1855,view_355,call_function,view.default,forward,15,1,1,1,1680,4329,4 -1856,view_as_complex_31,call_function,view_as_complex.default,forward,15,1,1,1,1681,4328,6 -1857,view_356,call_function,view.default,forward,15,1,1,1,2,4339,3 -1858,alias_default_432,call_function,alias.default,forward,15,1,1,4,3,4338,3 -1859,mul_107,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8 -1860,view_as_real_30,call_function,view_as_real.default,forward,15,1,1,1,1685,4326,6 -1861,view_357,call_function,view.default,forward,15,1,1,1,1686,4325,6 -1862,mul_108,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8 -1863,view_as_real_31,call_function,view_as_real.default,forward,15,1,1,1,1685,4326,6 -1864,view_358,call_function,view.default,forward,15,1,1,1,1686,4325,6 -1865,convert_element_type_370,call_function,convert_element_type.default,forward,15,1,1,1,1687,4324,6 -1866,convert_element_type_371,call_function,convert_element_type.default,forward,15,1,1,1,1687,4324,6 -1867,permute_168,call_function,permute.default,forward,15,1,1,1,1688,4323,6 -1868,permute_169,call_function,permute.default,forward,15,1,1,1,1688,4323,6 -1869,permute_170,call_function,permute.default,forward,15,1,1,1,1679,4323,4 -1870,alias_default_433,call_function,alias.default,forward,15,1,1,2,1689,4322,4 -1871,alias_default_434,call_function,alias.default,forward,15,1,1,2,1689,4322,4 
-1872,alias_default_435,call_function,alias.default,forward,15,1,1,2,1680,4322,4 -1873,_scaled_dot_product_flash_attention_15,call_function,_scaled_dot_product_flash_attention.default,forward,15,3,3,4,1713,4321,2 -1874,getitem_135,call_function,getitem,forward,15,1,1,1,1714,4317,2 -1875,getitem_136,call_function,getitem,forward,15,1,1,1,1714,1714,2 -1876,getitem_141,call_function,getitem,forward,15,1,1,1,1714,1714,1 -1877,getitem_142,call_function,getitem,forward,15,1,1,1,1714,1714,1 -1878,alias_default_436,call_function,alias.default,forward,15,1,1,2,1715,4316,4 -1879,permute_171,call_function,permute.default,forward,15,1,1,1,1716,4315,4 -1880,view_359,call_function,view.default,forward,15,1,1,1,1717,4314,3 -1881,dtype_cast_140,call_function,dtype_cast.default,forward,15,1,1,1,1,4316,3 -1882,permute_172,call_function,permute.default,forward,15,1,1,1,2,4315,3 -1883,alias_default_437,call_function,alias.default,forward,15,1,1,2,1718,4313,4 -1884,alias_default_438,call_function,alias.default,forward,15,1,1,2,3,4314,3 -1885,einsum_default_108,call_function,einsum.default,forward,15,2,2,1,1723,4312,5 -1886,add_76,call_function,add.Tensor,forward,15,2,2,1,1724,4311,10 -1887,dtype_cast_141,call_function,dtype_cast.default,forward,15,1,1,1,1,4300,2 -1888,alias_default_439,call_function,alias.default,forward,15,1,1,3,1725,4310,4 -1889,convert_element_type_374,call_function,convert_element_type.default,forward,15,1,1,1,1726,4308,4 -1890,alias_default_441,call_function,alias.default,forward,15,1,1,2,1727,4307,4 -1891,pow_32,call_function,pow.Tensor_Scalar,forward,15,1,1,1,1728,4306,4 -1892,mean_31,call_function,mean.dim,forward,15,1,1,1,1729,4305,4 -1893,add_77,call_function,add.Scalar,forward,15,1,1,1,1730,4304,3 -1894,rsqrt_31,call_function,rsqrt.default,forward,15,1,1,1,1731,4303,3 -1895,alias_default_442,call_function,alias.default,forward,15,1,1,3,1732,4302,3 -1896,mul_109,call_function,mul.Tensor,forward,15,2,2,1,1733,4298,8 
-1897,alias_default_440,call_function,alias.default,forward,15,1,1,2,2,4299,2 -1898,mul_110,call_function,mul.Tensor,forward,15,2,2,1,1737,4297,8 -1899,convert_element_type_375,call_function,convert_element_type.default,forward,15,1,1,1,1738,4296,6 -1900,dtype_cast_142,call_function,dtype_cast.default,forward,15,1,1,1,1,4296,3 -1901,permute_173,call_function,permute.default,forward,15,1,1,1,2,4295,3 -1902,alias_default_443,call_function,alias.default,forward,15,1,1,4,1739,4295,4 -1903,alias_default_444,call_function,alias.default,forward,15,1,1,2,3,4294,3 -1904,einsum_default_109,call_function,einsum.default,forward,15,2,2,1,1744,4292,5 -1905,alias_default_445,call_function,alias.default,forward,15,1,1,2,1745,4291,4 -1906,convert_element_type_378,call_function,convert_element_type.default,forward,15,1,1,1,1746,4279,4 -1907,alias_default_446,call_function,alias.default,forward,15,1,1,2,1747,4278,4 -1908,neg_15,call_function,neg.default,forward,15,1,1,1,1748,4277,8 -1909,exp_15,call_function,exp.default,forward,15,1,1,1,1749,4276,6 -1910,add_78,call_function,add.Tensor,forward,15,1,1,1,1750,4275,4 -1911,div_15,call_function,div.Tensor,forward,15,2,2,1,1751,4274,6 -1912,convert_element_type_379,call_function,convert_element_type.default,forward,15,1,1,1,1752,4273,6 -1913,dtype_cast_143,call_function,dtype_cast.default,forward,15,1,1,1,1,4277,3 -1914,permute_174,call_function,permute.default,forward,15,1,1,1,2,4276,3 -1915,alias_default_448,call_function,alias.default,forward,15,1,1,2,3,4275,3 -1916,einsum_default_110,call_function,einsum.default,forward,15,2,2,1,1744,4273,5 -1917,alias_default_447,call_function,alias.default,forward,15,1,1,2,1753,4272,4 -1918,alias_default_449,call_function,alias.default,forward,15,1,1,2,1745,4272,4 -1919,mul_111,call_function,mul.Tensor,forward,15,2,2,1,1760,4271,8 -1920,dtype_cast_144,call_function,dtype_cast.default,forward,15,1,1,1,1,4273,3 -1921,permute_175,call_function,permute.default,forward,15,1,1,1,2,4272,3 
-1922,alias_default_450,call_function,alias.default,forward,15,1,1,2,1761,4270,4 -1923,alias_default_451,call_function,alias.default,forward,15,1,1,2,3,4271,3 -1924,einsum_default_111,call_function,einsum.default,forward,15,2,2,1,1766,4269,5 -1925,add_79,call_function,add.Tensor,forward,15,2,2,1,1767,4268,10 -1926,dtype_cast_145,call_function,dtype_cast.default,forward,16,1,1,1,1,4257,2 -1927,alias_default_452,call_function,alias.default,forward,15,1,1,3,1768,4267,4 -1928,convert_element_type_384,call_function,convert_element_type.default,forward,16,1,1,1,1769,4265,4 -1929,alias_default_454,call_function,alias.default,forward,16,1,1,2,1770,4264,4 -1930,pow_33,call_function,pow.Tensor_Scalar,forward,16,1,1,1,1771,4263,4 -1931,mean_32,call_function,mean.dim,forward,16,1,1,1,1772,4262,4 -1932,add_80,call_function,add.Scalar,forward,16,1,1,1,1773,4261,3 -1933,rsqrt_32,call_function,rsqrt.default,forward,16,1,1,1,1774,4260,3 -1934,alias_default_455,call_function,alias.default,forward,16,1,1,3,1775,4259,3 -1935,mul_112,call_function,mul.Tensor,forward,16,2,2,1,1776,4255,8 -1936,alias_default_453,call_function,alias.default,forward,16,1,1,2,2,4256,2 -1937,mul_113,call_function,mul.Tensor,forward,16,2,2,1,1780,4254,8 -1938,convert_element_type_385,call_function,convert_element_type.default,forward,16,1,1,1,1781,4253,6 -1939,dtype_cast_146,call_function,dtype_cast.default,forward,16,1,1,1,1,4240,3 -1940,permute_176,call_function,permute.default,forward,16,1,1,1,2,4239,3 -1941,alias_default_456,call_function,alias.default,forward,16,1,1,6,1782,4252,4 -1942,alias_default_457,call_function,alias.default,forward,16,1,1,2,3,4238,3 -1943,einsum_default_112,call_function,einsum.default,forward,16,2,2,1,1787,4236,5 -1944,dtype_cast_147,call_function,dtype_cast.default,forward,16,1,1,1,1,4240,3 -1945,permute_177,call_function,permute.default,forward,16,1,1,1,2,4239,3 -1946,alias_default_458,call_function,alias.default,forward,16,1,1,2,3,4238,3 
-1947,einsum_default_113,call_function,einsum.default,forward,16,2,2,1,1787,4236,5 -1948,dtype_cast_148,call_function,dtype_cast.default,forward,16,1,1,1,1,4233,3 -1949,permute_178,call_function,permute.default,forward,16,1,1,1,2,4232,3 -1950,alias_default_459,call_function,alias.default,forward,16,1,1,2,3,4231,3 -1951,einsum_default_114,call_function,einsum.default,forward,16,2,2,1,1787,4229,5 -1952,view_374,call_function,view.default,forward,16,1,1,1,1788,4235,4 -1953,view_375,call_function,view.default,forward,16,1,1,1,1788,4235,4 -1954,view_376,call_function,view.default,forward,16,1,1,1,1788,4228,4 -1955,convert_element_type_392,call_function,convert_element_type.default,forward,16,1,1,1,1789,4234,4 -1956,view_377,call_function,view.default,forward,16,1,1,1,1790,4233,4 -1957,view_as_complex_32,call_function,view_as_complex.default,forward,16,1,1,1,1791,4232,6 -1958,convert_element_type_393,call_function,convert_element_type.default,forward,16,1,1,1,1789,4234,4 -1959,view_378,call_function,view.default,forward,16,1,1,1,1790,4233,4 -1960,view_as_complex_33,call_function,view_as_complex.default,forward,16,1,1,1,1791,4232,6 -1961,view_379,call_function,view.default,forward,16,1,1,1,2,4243,3 -1962,alias_default_460,call_function,alias.default,forward,16,1,1,4,3,4242,3 -1963,mul_114,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8 -1964,view_as_real_32,call_function,view_as_real.default,forward,16,1,1,1,1795,4230,6 -1965,view_380,call_function,view.default,forward,16,1,1,1,1796,4229,6 -1966,mul_115,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8 -1967,view_as_real_33,call_function,view_as_real.default,forward,16,1,1,1,1795,4230,6 -1968,view_381,call_function,view.default,forward,16,1,1,1,1796,4229,6 -1969,convert_element_type_394,call_function,convert_element_type.default,forward,16,1,1,1,1797,4228,6 -1970,convert_element_type_395,call_function,convert_element_type.default,forward,16,1,1,1,1797,4228,6 
-1971,permute_179,call_function,permute.default,forward,16,1,1,1,1798,4227,6 -1972,permute_180,call_function,permute.default,forward,16,1,1,1,1798,4227,6 -1973,permute_181,call_function,permute.default,forward,16,1,1,1,1789,4227,4 -1974,alias_default_461,call_function,alias.default,forward,16,1,1,2,1799,4226,4 -1975,alias_default_462,call_function,alias.default,forward,16,1,1,2,1799,4226,4 -1976,alias_default_463,call_function,alias.default,forward,16,1,1,2,1790,4226,4 -1977,_scaled_dot_product_flash_attention_16,call_function,_scaled_dot_product_flash_attention.default,forward,16,3,3,4,1823,4225,2 -1978,getitem_144,call_function,getitem,forward,16,1,1,1,1824,4221,2 -1979,getitem_145,call_function,getitem,forward,16,1,1,1,1824,1824,2 -1980,getitem_150,call_function,getitem,forward,16,1,1,1,1824,1824,1 -1981,getitem_151,call_function,getitem,forward,16,1,1,1,1824,1824,1 -1982,alias_default_464,call_function,alias.default,forward,16,1,1,2,1825,4220,4 -1983,permute_182,call_function,permute.default,forward,16,1,1,1,1826,4219,4 -1984,view_382,call_function,view.default,forward,16,1,1,1,1827,4218,3 -1985,dtype_cast_149,call_function,dtype_cast.default,forward,16,1,1,1,1,4220,3 -1986,permute_183,call_function,permute.default,forward,16,1,1,1,2,4219,3 -1987,alias_default_465,call_function,alias.default,forward,16,1,1,2,1828,4217,4 -1988,alias_default_466,call_function,alias.default,forward,16,1,1,2,3,4218,3 -1989,einsum_default_115,call_function,einsum.default,forward,16,2,2,1,1833,4216,5 -1990,add_81,call_function,add.Tensor,forward,16,2,2,1,1834,4215,10 -1991,dtype_cast_150,call_function,dtype_cast.default,forward,16,1,1,1,1,4204,2 -1992,alias_default_467,call_function,alias.default,forward,16,1,1,3,1835,4214,4 -1993,convert_element_type_398,call_function,convert_element_type.default,forward,16,1,1,1,1836,4212,4 -1994,alias_default_469,call_function,alias.default,forward,16,1,1,2,1837,4211,4 -1995,pow_34,call_function,pow.Tensor_Scalar,forward,16,1,1,1,1838,4210,4 
-1996,mean_33,call_function,mean.dim,forward,16,1,1,1,1839,4209,4 -1997,add_82,call_function,add.Scalar,forward,16,1,1,1,1840,4208,3 -1998,rsqrt_33,call_function,rsqrt.default,forward,16,1,1,1,1841,4207,3 -1999,alias_default_470,call_function,alias.default,forward,16,1,1,3,1842,4206,3 -2000,mul_116,call_function,mul.Tensor,forward,16,2,2,1,1843,4202,8 -2001,alias_default_468,call_function,alias.default,forward,16,1,1,2,2,4203,2 -2002,mul_117,call_function,mul.Tensor,forward,16,2,2,1,1847,4201,8 -2003,convert_element_type_399,call_function,convert_element_type.default,forward,16,1,1,1,1848,4200,6 -2004,dtype_cast_151,call_function,dtype_cast.default,forward,16,1,1,1,1,4200,3 -2005,permute_184,call_function,permute.default,forward,16,1,1,1,2,4199,3 -2006,alias_default_471,call_function,alias.default,forward,16,1,1,4,1849,4199,4 -2007,alias_default_472,call_function,alias.default,forward,16,1,1,2,3,4198,3 -2008,einsum_default_116,call_function,einsum.default,forward,16,2,2,1,1854,4196,5 -2009,alias_default_473,call_function,alias.default,forward,16,1,1,2,1855,4195,4 -2010,convert_element_type_402,call_function,convert_element_type.default,forward,16,1,1,1,1856,4183,4 -2011,alias_default_474,call_function,alias.default,forward,16,1,1,2,1857,4182,4 -2012,neg_16,call_function,neg.default,forward,16,1,1,1,1858,4181,8 -2013,exp_16,call_function,exp.default,forward,16,1,1,1,1859,4180,6 -2014,add_83,call_function,add.Tensor,forward,16,1,1,1,1860,4179,4 -2015,div_16,call_function,div.Tensor,forward,16,2,2,1,1861,4178,6 -2016,convert_element_type_403,call_function,convert_element_type.default,forward,16,1,1,1,1862,4177,6 -2017,dtype_cast_152,call_function,dtype_cast.default,forward,16,1,1,1,1,4181,3 -2018,permute_185,call_function,permute.default,forward,16,1,1,1,2,4180,3 -2019,alias_default_476,call_function,alias.default,forward,16,1,1,2,3,4179,3 -2020,einsum_default_117,call_function,einsum.default,forward,16,2,2,1,1854,4177,5 
-2021,alias_default_475,call_function,alias.default,forward,16,1,1,2,1863,4176,4 -2022,alias_default_477,call_function,alias.default,forward,16,1,1,2,1855,4176,4 -2023,mul_118,call_function,mul.Tensor,forward,16,2,2,1,1870,4175,8 -2024,dtype_cast_153,call_function,dtype_cast.default,forward,16,1,1,1,1,4177,3 -2025,permute_186,call_function,permute.default,forward,16,1,1,1,2,4176,3 -2026,alias_default_478,call_function,alias.default,forward,16,1,1,2,1871,4174,4 -2027,alias_default_479,call_function,alias.default,forward,16,1,1,2,3,4175,3 -2028,einsum_default_118,call_function,einsum.default,forward,16,2,2,1,1876,4173,5 -2029,add_84,call_function,add.Tensor,forward,16,2,2,1,1877,4172,10 -2030,dtype_cast_154,call_function,dtype_cast.default,forward,17,1,1,1,1,4161,2 -2031,alias_default_480,call_function,alias.default,forward,16,1,1,3,1878,4171,4 -2032,convert_element_type_408,call_function,convert_element_type.default,forward,17,1,1,1,1879,4169,4 -2033,alias_default_482,call_function,alias.default,forward,17,1,1,2,1880,4168,4 -2034,pow_35,call_function,pow.Tensor_Scalar,forward,17,1,1,1,1881,4167,4 -2035,mean_34,call_function,mean.dim,forward,17,1,1,1,1882,4166,4 -2036,add_85,call_function,add.Scalar,forward,17,1,1,1,1883,4165,3 -2037,rsqrt_34,call_function,rsqrt.default,forward,17,1,1,1,1884,4164,3 -2038,alias_default_483,call_function,alias.default,forward,17,1,1,3,1885,4163,3 -2039,mul_119,call_function,mul.Tensor,forward,17,2,2,1,1886,4159,8 -2040,alias_default_481,call_function,alias.default,forward,17,1,1,2,2,4160,2 -2041,mul_120,call_function,mul.Tensor,forward,17,2,2,1,1890,4158,8 -2042,convert_element_type_409,call_function,convert_element_type.default,forward,17,1,1,1,1891,4157,6 -2043,dtype_cast_155,call_function,dtype_cast.default,forward,17,1,1,1,1,4144,3 -2044,permute_187,call_function,permute.default,forward,17,1,1,1,2,4143,3 -2045,alias_default_484,call_function,alias.default,forward,17,1,1,6,1892,4156,4 
-2046,alias_default_485,call_function,alias.default,forward,17,1,1,2,3,4142,3 -2047,einsum_default_119,call_function,einsum.default,forward,17,2,2,1,1897,4140,5 -2048,dtype_cast_156,call_function,dtype_cast.default,forward,17,1,1,1,1,4144,3 -2049,permute_188,call_function,permute.default,forward,17,1,1,1,2,4143,3 -2050,alias_default_486,call_function,alias.default,forward,17,1,1,2,3,4142,3 -2051,einsum_default_120,call_function,einsum.default,forward,17,2,2,1,1897,4140,5 -2052,dtype_cast_157,call_function,dtype_cast.default,forward,17,1,1,1,1,4137,3 -2053,permute_189,call_function,permute.default,forward,17,1,1,1,2,4136,3 -2054,alias_default_487,call_function,alias.default,forward,17,1,1,2,3,4135,3 -2055,einsum_default_121,call_function,einsum.default,forward,17,2,2,1,1897,4133,5 -2056,view_397,call_function,view.default,forward,17,1,1,1,1898,4139,4 -2057,view_398,call_function,view.default,forward,17,1,1,1,1898,4139,4 -2058,view_399,call_function,view.default,forward,17,1,1,1,1898,4132,4 -2059,convert_element_type_416,call_function,convert_element_type.default,forward,17,1,1,1,1899,4138,4 -2060,view_400,call_function,view.default,forward,17,1,1,1,1900,4137,4 -2061,view_as_complex_34,call_function,view_as_complex.default,forward,17,1,1,1,1901,4136,6 -2062,convert_element_type_417,call_function,convert_element_type.default,forward,17,1,1,1,1899,4138,4 -2063,view_401,call_function,view.default,forward,17,1,1,1,1900,4137,4 -2064,view_as_complex_35,call_function,view_as_complex.default,forward,17,1,1,1,1901,4136,6 -2065,view_402,call_function,view.default,forward,17,1,1,1,2,4147,3 -2066,alias_default_488,call_function,alias.default,forward,17,1,1,4,3,4146,3 -2067,mul_121,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8 -2068,view_as_real_34,call_function,view_as_real.default,forward,17,1,1,1,1905,4134,6 -2069,view_403,call_function,view.default,forward,17,1,1,1,1906,4133,6 -2070,mul_122,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8 
-2071,view_as_real_35,call_function,view_as_real.default,forward,17,1,1,1,1905,4134,6 -2072,view_404,call_function,view.default,forward,17,1,1,1,1906,4133,6 -2073,convert_element_type_418,call_function,convert_element_type.default,forward,17,1,1,1,1907,4132,6 -2074,convert_element_type_419,call_function,convert_element_type.default,forward,17,1,1,1,1907,4132,6 -2075,permute_190,call_function,permute.default,forward,17,1,1,1,1908,4131,6 -2076,permute_191,call_function,permute.default,forward,17,1,1,1,1908,4131,6 -2077,permute_192,call_function,permute.default,forward,17,1,1,1,1899,4131,4 -2078,alias_default_489,call_function,alias.default,forward,17,1,1,2,1909,4130,4 -2079,alias_default_490,call_function,alias.default,forward,17,1,1,2,1909,4130,4 -2080,alias_default_491,call_function,alias.default,forward,17,1,1,2,1900,4130,4 -2081,_scaled_dot_product_flash_attention_17,call_function,_scaled_dot_product_flash_attention.default,forward,17,3,3,4,1933,4129,2 -2082,getitem_153,call_function,getitem,forward,17,1,1,1,1934,4125,2 -2083,getitem_154,call_function,getitem,forward,17,1,1,1,1934,1934,2 -2084,getitem_159,call_function,getitem,forward,17,1,1,1,1934,1934,1 -2085,getitem_160,call_function,getitem,forward,17,1,1,1,1934,1934,1 -2086,alias_default_492,call_function,alias.default,forward,17,1,1,2,1935,4124,4 -2087,permute_193,call_function,permute.default,forward,17,1,1,1,1936,4123,4 -2088,view_405,call_function,view.default,forward,17,1,1,1,1937,4122,3 -2089,dtype_cast_158,call_function,dtype_cast.default,forward,17,1,1,1,1,4124,3 -2090,permute_194,call_function,permute.default,forward,17,1,1,1,2,4123,3 -2091,alias_default_493,call_function,alias.default,forward,17,1,1,2,1938,4121,4 -2092,alias_default_494,call_function,alias.default,forward,17,1,1,2,3,4122,3 -2093,einsum_default_122,call_function,einsum.default,forward,17,2,2,1,1943,4120,5 -2094,add_86,call_function,add.Tensor,forward,17,2,2,1,1944,4119,10 
-2095,dtype_cast_159,call_function,dtype_cast.default,forward,17,1,1,1,1,4108,2 -2096,alias_default_495,call_function,alias.default,forward,17,1,1,3,1945,4118,4 -2097,convert_element_type_422,call_function,convert_element_type.default,forward,17,1,1,1,1946,4116,4 -2098,alias_default_497,call_function,alias.default,forward,17,1,1,2,1947,4115,4 -2099,pow_36,call_function,pow.Tensor_Scalar,forward,17,1,1,1,1948,4114,4 -2100,mean_35,call_function,mean.dim,forward,17,1,1,1,1949,4113,4 -2101,add_87,call_function,add.Scalar,forward,17,1,1,1,1950,4112,3 -2102,rsqrt_35,call_function,rsqrt.default,forward,17,1,1,1,1951,4111,3 -2103,alias_default_498,call_function,alias.default,forward,17,1,1,3,1952,4110,3 -2104,mul_123,call_function,mul.Tensor,forward,17,2,2,1,1953,4106,8 -2105,alias_default_496,call_function,alias.default,forward,17,1,1,2,2,4107,2 -2106,mul_124,call_function,mul.Tensor,forward,17,2,2,1,1957,4105,8 -2107,convert_element_type_423,call_function,convert_element_type.default,forward,17,1,1,1,1958,4104,6 -2108,dtype_cast_160,call_function,dtype_cast.default,forward,17,1,1,1,1,4104,3 -2109,permute_195,call_function,permute.default,forward,17,1,1,1,2,4103,3 -2110,alias_default_499,call_function,alias.default,forward,17,1,1,4,1959,4103,4 -2111,alias_default_500,call_function,alias.default,forward,17,1,1,2,3,4102,3 -2112,einsum_default_123,call_function,einsum.default,forward,17,2,2,1,1964,4100,5 -2113,alias_default_501,call_function,alias.default,forward,17,1,1,2,1965,4099,4 -2114,convert_element_type_426,call_function,convert_element_type.default,forward,17,1,1,1,1966,4087,4 -2115,alias_default_502,call_function,alias.default,forward,17,1,1,2,1967,4086,4 -2116,neg_17,call_function,neg.default,forward,17,1,1,1,1968,4085,8 -2117,exp_17,call_function,exp.default,forward,17,1,1,1,1969,4084,6 -2118,add_88,call_function,add.Tensor,forward,17,1,1,1,1970,4083,4 -2119,div_17,call_function,div.Tensor,forward,17,2,2,1,1971,4082,6 
-2120,convert_element_type_427,call_function,convert_element_type.default,forward,17,1,1,1,1972,4081,6 -2121,dtype_cast_161,call_function,dtype_cast.default,forward,17,1,1,1,1,4085,3 -2122,permute_196,call_function,permute.default,forward,17,1,1,1,2,4084,3 -2123,alias_default_504,call_function,alias.default,forward,17,1,1,2,3,4083,3 -2124,einsum_default_124,call_function,einsum.default,forward,17,2,2,1,1964,4081,5 -2125,alias_default_503,call_function,alias.default,forward,17,1,1,2,1973,4080,4 -2126,alias_default_505,call_function,alias.default,forward,17,1,1,2,1965,4080,4 -2127,mul_125,call_function,mul.Tensor,forward,17,2,2,1,1980,4079,8 -2128,dtype_cast_162,call_function,dtype_cast.default,forward,17,1,1,1,1,4081,3 -2129,permute_197,call_function,permute.default,forward,17,1,1,1,2,4080,3 -2130,alias_default_506,call_function,alias.default,forward,17,1,1,2,1981,4078,4 -2131,alias_default_507,call_function,alias.default,forward,17,1,1,2,3,4079,3 -2132,einsum_default_125,call_function,einsum.default,forward,17,2,2,1,1986,4077,5 -2133,add_89,call_function,add.Tensor,forward,17,2,2,1,1987,4076,10 -2134,dtype_cast_163,call_function,dtype_cast.default,forward,18,1,1,1,1,4065,2 -2135,alias_default_508,call_function,alias.default,forward,17,1,1,3,1988,4075,4 -2136,convert_element_type_432,call_function,convert_element_type.default,forward,18,1,1,1,1989,4073,4 -2137,alias_default_510,call_function,alias.default,forward,18,1,1,2,1990,4072,4 -2138,pow_37,call_function,pow.Tensor_Scalar,forward,18,1,1,1,1991,4071,4 -2139,mean_36,call_function,mean.dim,forward,18,1,1,1,1992,4070,4 -2140,add_90,call_function,add.Scalar,forward,18,1,1,1,1993,4069,3 -2141,rsqrt_36,call_function,rsqrt.default,forward,18,1,1,1,1994,4068,3 -2142,alias_default_511,call_function,alias.default,forward,18,1,1,3,1995,4067,3 -2143,mul_126,call_function,mul.Tensor,forward,18,2,2,1,1996,4063,8 -2144,alias_default_509,call_function,alias.default,forward,18,1,1,2,2,4064,2 
-2145,mul_127,call_function,mul.Tensor,forward,18,2,2,1,2000,4062,8 -2146,convert_element_type_433,call_function,convert_element_type.default,forward,18,1,1,1,2001,4061,6 -2147,dtype_cast_164,call_function,dtype_cast.default,forward,18,1,1,1,1,4048,3 -2148,permute_198,call_function,permute.default,forward,18,1,1,1,2,4047,3 -2149,alias_default_512,call_function,alias.default,forward,18,1,1,6,2002,4060,4 -2150,alias_default_513,call_function,alias.default,forward,18,1,1,2,3,4046,3 -2151,einsum_default_126,call_function,einsum.default,forward,18,2,2,1,2007,4044,5 -2152,dtype_cast_165,call_function,dtype_cast.default,forward,18,1,1,1,1,4048,3 -2153,permute_199,call_function,permute.default,forward,18,1,1,1,2,4047,3 -2154,alias_default_514,call_function,alias.default,forward,18,1,1,2,3,4046,3 -2155,einsum_default_127,call_function,einsum.default,forward,18,2,2,1,2007,4044,5 -2156,dtype_cast_166,call_function,dtype_cast.default,forward,18,1,1,1,1,4041,3 -2157,permute_200,call_function,permute.default,forward,18,1,1,1,2,4040,3 -2158,alias_default_515,call_function,alias.default,forward,18,1,1,2,3,4039,3 -2159,einsum_default_128,call_function,einsum.default,forward,18,2,2,1,2007,4037,5 -2160,view_420,call_function,view.default,forward,18,1,1,1,2008,4043,4 -2161,view_421,call_function,view.default,forward,18,1,1,1,2008,4043,4 -2162,view_422,call_function,view.default,forward,18,1,1,1,2008,4036,4 -2163,convert_element_type_440,call_function,convert_element_type.default,forward,18,1,1,1,2009,4042,4 -2164,view_423,call_function,view.default,forward,18,1,1,1,2010,4041,4 -2165,view_as_complex_36,call_function,view_as_complex.default,forward,18,1,1,1,2011,4040,6 -2166,convert_element_type_441,call_function,convert_element_type.default,forward,18,1,1,1,2009,4042,4 -2167,view_424,call_function,view.default,forward,18,1,1,1,2010,4041,4 -2168,view_as_complex_37,call_function,view_as_complex.default,forward,18,1,1,1,2011,4040,6 
-2169,view_425,call_function,view.default,forward,18,1,1,1,2,4051,3 -2170,alias_default_516,call_function,alias.default,forward,18,1,1,4,3,4050,3 -2171,mul_128,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8 -2172,view_as_real_36,call_function,view_as_real.default,forward,18,1,1,1,2015,4038,6 -2173,view_426,call_function,view.default,forward,18,1,1,1,2016,4037,6 -2174,mul_129,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8 -2175,view_as_real_37,call_function,view_as_real.default,forward,18,1,1,1,2015,4038,6 -2176,view_427,call_function,view.default,forward,18,1,1,1,2016,4037,6 -2177,convert_element_type_442,call_function,convert_element_type.default,forward,18,1,1,1,2017,4036,6 -2178,convert_element_type_443,call_function,convert_element_type.default,forward,18,1,1,1,2017,4036,6 -2179,permute_201,call_function,permute.default,forward,18,1,1,1,2018,4035,6 -2180,permute_202,call_function,permute.default,forward,18,1,1,1,2018,4035,6 -2181,permute_203,call_function,permute.default,forward,18,1,1,1,2009,4035,4 -2182,alias_default_517,call_function,alias.default,forward,18,1,1,2,2019,4034,4 -2183,alias_default_518,call_function,alias.default,forward,18,1,1,2,2019,4034,4 -2184,alias_default_519,call_function,alias.default,forward,18,1,1,2,2010,4034,4 -2185,_scaled_dot_product_flash_attention_18,call_function,_scaled_dot_product_flash_attention.default,forward,18,3,3,4,2043,4033,2 -2186,getitem_162,call_function,getitem,forward,18,1,1,1,2044,4029,2 -2187,getitem_163,call_function,getitem,forward,18,1,1,1,2044,2044,2 -2188,getitem_168,call_function,getitem,forward,18,1,1,1,2044,2044,1 -2189,getitem_169,call_function,getitem,forward,18,1,1,1,2044,2044,1 -2190,alias_default_520,call_function,alias.default,forward,18,1,1,2,2045,4028,4 -2191,permute_204,call_function,permute.default,forward,18,1,1,1,2046,4027,4 -2192,view_428,call_function,view.default,forward,18,1,1,1,2047,4026,3 -2193,dtype_cast_167,call_function,dtype_cast.default,forward,18,1,1,1,1,4028,3 
-2194,permute_205,call_function,permute.default,forward,18,1,1,1,2,4027,3 -2195,alias_default_521,call_function,alias.default,forward,18,1,1,2,2048,4025,4 -2196,alias_default_522,call_function,alias.default,forward,18,1,1,2,3,4026,3 -2197,einsum_default_129,call_function,einsum.default,forward,18,2,2,1,2053,4024,5 -2198,add_91,call_function,add.Tensor,forward,18,2,2,1,2054,4023,10 -2199,dtype_cast_168,call_function,dtype_cast.default,forward,18,1,1,1,1,4012,2 -2200,alias_default_523,call_function,alias.default,forward,18,1,1,3,2055,4022,4 -2201,convert_element_type_446,call_function,convert_element_type.default,forward,18,1,1,1,2056,4020,4 -2202,alias_default_525,call_function,alias.default,forward,18,1,1,2,2057,4019,4 -2203,pow_38,call_function,pow.Tensor_Scalar,forward,18,1,1,1,2058,4018,4 -2204,mean_37,call_function,mean.dim,forward,18,1,1,1,2059,4017,4 -2205,add_92,call_function,add.Scalar,forward,18,1,1,1,2060,4016,3 -2206,rsqrt_37,call_function,rsqrt.default,forward,18,1,1,1,2061,4015,3 -2207,alias_default_526,call_function,alias.default,forward,18,1,1,3,2062,4014,3 -2208,mul_130,call_function,mul.Tensor,forward,18,2,2,1,2063,4010,8 -2209,alias_default_524,call_function,alias.default,forward,18,1,1,2,2,4011,2 -2210,mul_131,call_function,mul.Tensor,forward,18,2,2,1,2067,4009,8 -2211,convert_element_type_447,call_function,convert_element_type.default,forward,18,1,1,1,2068,4008,6 -2212,dtype_cast_169,call_function,dtype_cast.default,forward,18,1,1,1,1,4008,3 -2213,permute_206,call_function,permute.default,forward,18,1,1,1,2,4007,3 -2214,alias_default_527,call_function,alias.default,forward,18,1,1,4,2069,4007,4 -2215,alias_default_528,call_function,alias.default,forward,18,1,1,2,3,4006,3 -2216,einsum_default_130,call_function,einsum.default,forward,18,2,2,1,2074,4004,5 -2217,alias_default_529,call_function,alias.default,forward,18,1,1,2,2075,4003,4 -2218,convert_element_type_450,call_function,convert_element_type.default,forward,18,1,1,1,2076,3991,4 
-2219,alias_default_530,call_function,alias.default,forward,18,1,1,2,2077,3990,4 -2220,neg_18,call_function,neg.default,forward,18,1,1,1,2078,3989,8 -2221,exp_18,call_function,exp.default,forward,18,1,1,1,2079,3988,6 -2222,add_93,call_function,add.Tensor,forward,18,1,1,1,2080,3987,4 -2223,div_18,call_function,div.Tensor,forward,18,2,2,1,2081,3986,6 -2224,convert_element_type_451,call_function,convert_element_type.default,forward,18,1,1,1,2082,3985,6 -2225,dtype_cast_170,call_function,dtype_cast.default,forward,18,1,1,1,1,3989,3 -2226,permute_207,call_function,permute.default,forward,18,1,1,1,2,3988,3 -2227,alias_default_532,call_function,alias.default,forward,18,1,1,2,3,3987,3 -2228,einsum_default_131,call_function,einsum.default,forward,18,2,2,1,2074,3985,5 -2229,alias_default_531,call_function,alias.default,forward,18,1,1,2,2083,3984,4 -2230,alias_default_533,call_function,alias.default,forward,18,1,1,2,2075,3984,4 -2231,mul_132,call_function,mul.Tensor,forward,18,2,2,1,2090,3983,8 -2232,dtype_cast_171,call_function,dtype_cast.default,forward,18,1,1,1,1,3985,3 -2233,permute_208,call_function,permute.default,forward,18,1,1,1,2,3984,3 -2234,alias_default_534,call_function,alias.default,forward,18,1,1,2,2091,3982,4 -2235,alias_default_535,call_function,alias.default,forward,18,1,1,2,3,3983,3 -2236,einsum_default_132,call_function,einsum.default,forward,18,2,2,1,2096,3981,5 -2237,add_94,call_function,add.Tensor,forward,18,2,2,1,2097,3980,10 -2238,dtype_cast_172,call_function,dtype_cast.default,forward,19,1,1,1,1,3969,2 -2239,alias_default_536,call_function,alias.default,forward,18,1,1,3,2098,3979,4 -2240,convert_element_type_456,call_function,convert_element_type.default,forward,19,1,1,1,2099,3977,4 -2241,alias_default_538,call_function,alias.default,forward,19,1,1,2,2100,3976,4 -2242,pow_39,call_function,pow.Tensor_Scalar,forward,19,1,1,1,2101,3975,4 -2243,mean_38,call_function,mean.dim,forward,19,1,1,1,2102,3974,4 
-2244,add_95,call_function,add.Scalar,forward,19,1,1,1,2103,3973,3 -2245,rsqrt_38,call_function,rsqrt.default,forward,19,1,1,1,2104,3972,3 -2246,alias_default_539,call_function,alias.default,forward,19,1,1,3,2105,3971,3 -2247,mul_133,call_function,mul.Tensor,forward,19,2,2,1,2106,3967,8 -2248,alias_default_537,call_function,alias.default,forward,19,1,1,2,2,3968,2 -2249,mul_134,call_function,mul.Tensor,forward,19,2,2,1,2110,3966,8 -2250,convert_element_type_457,call_function,convert_element_type.default,forward,19,1,1,1,2111,3965,6 -2251,dtype_cast_173,call_function,dtype_cast.default,forward,19,1,1,1,1,3952,3 -2252,permute_209,call_function,permute.default,forward,19,1,1,1,2,3951,3 -2253,alias_default_540,call_function,alias.default,forward,19,1,1,6,2112,3964,4 -2254,alias_default_541,call_function,alias.default,forward,19,1,1,2,3,3950,3 -2255,einsum_default_133,call_function,einsum.default,forward,19,2,2,1,2117,3948,5 -2256,dtype_cast_174,call_function,dtype_cast.default,forward,19,1,1,1,1,3952,3 -2257,permute_210,call_function,permute.default,forward,19,1,1,1,2,3951,3 -2258,alias_default_542,call_function,alias.default,forward,19,1,1,2,3,3950,3 -2259,einsum_default_134,call_function,einsum.default,forward,19,2,2,1,2117,3948,5 -2260,dtype_cast_175,call_function,dtype_cast.default,forward,19,1,1,1,1,3945,3 -2261,permute_211,call_function,permute.default,forward,19,1,1,1,2,3944,3 -2262,alias_default_543,call_function,alias.default,forward,19,1,1,2,3,3943,3 -2263,einsum_default_135,call_function,einsum.default,forward,19,2,2,1,2117,3941,5 -2264,view_443,call_function,view.default,forward,19,1,1,1,2118,3947,4 -2265,view_444,call_function,view.default,forward,19,1,1,1,2118,3947,4 -2266,view_445,call_function,view.default,forward,19,1,1,1,2118,3940,4 -2267,convert_element_type_464,call_function,convert_element_type.default,forward,19,1,1,1,2119,3946,4 -2268,view_446,call_function,view.default,forward,19,1,1,1,2120,3945,4 
-2269,view_as_complex_38,call_function,view_as_complex.default,forward,19,1,1,1,2121,3944,6 -2270,convert_element_type_465,call_function,convert_element_type.default,forward,19,1,1,1,2119,3946,4 -2271,view_447,call_function,view.default,forward,19,1,1,1,2120,3945,4 -2272,view_as_complex_39,call_function,view_as_complex.default,forward,19,1,1,1,2121,3944,6 -2273,view_448,call_function,view.default,forward,19,1,1,1,2,3955,3 -2274,alias_default_544,call_function,alias.default,forward,19,1,1,4,3,3954,3 -2275,mul_135,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8 -2276,view_as_real_38,call_function,view_as_real.default,forward,19,1,1,1,2125,3942,6 -2277,view_449,call_function,view.default,forward,19,1,1,1,2126,3941,6 -2278,mul_136,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8 -2279,view_as_real_39,call_function,view_as_real.default,forward,19,1,1,1,2125,3942,6 -2280,view_450,call_function,view.default,forward,19,1,1,1,2126,3941,6 -2281,convert_element_type_466,call_function,convert_element_type.default,forward,19,1,1,1,2127,3940,6 -2282,convert_element_type_467,call_function,convert_element_type.default,forward,19,1,1,1,2127,3940,6 -2283,permute_212,call_function,permute.default,forward,19,1,1,1,2128,3939,6 -2284,permute_213,call_function,permute.default,forward,19,1,1,1,2128,3939,6 -2285,permute_214,call_function,permute.default,forward,19,1,1,1,2119,3939,4 -2286,alias_default_545,call_function,alias.default,forward,19,1,1,2,2129,3938,4 -2287,alias_default_546,call_function,alias.default,forward,19,1,1,2,2129,3938,4 -2288,alias_default_547,call_function,alias.default,forward,19,1,1,2,2120,3938,4 -2289,_scaled_dot_product_flash_attention_19,call_function,_scaled_dot_product_flash_attention.default,forward,19,3,3,4,2153,3937,2 -2290,getitem_171,call_function,getitem,forward,19,1,1,1,2154,3933,2 -2291,getitem_172,call_function,getitem,forward,19,1,1,1,2154,2154,2 -2292,getitem_177,call_function,getitem,forward,19,1,1,1,2154,2154,1 
-2293,getitem_178,call_function,getitem,forward,19,1,1,1,2154,2154,1 -2294,alias_default_548,call_function,alias.default,forward,19,1,1,2,2155,3932,4 -2295,permute_215,call_function,permute.default,forward,19,1,1,1,2156,3931,4 -2296,view_451,call_function,view.default,forward,19,1,1,1,2157,3930,3 -2297,dtype_cast_176,call_function,dtype_cast.default,forward,19,1,1,1,1,3932,3 -2298,permute_216,call_function,permute.default,forward,19,1,1,1,2,3931,3 -2299,alias_default_549,call_function,alias.default,forward,19,1,1,2,2158,3929,4 -2300,alias_default_550,call_function,alias.default,forward,19,1,1,2,3,3930,3 -2301,einsum_default_136,call_function,einsum.default,forward,19,2,2,1,2163,3928,5 -2302,add_96,call_function,add.Tensor,forward,19,2,2,1,2164,3927,10 -2303,dtype_cast_177,call_function,dtype_cast.default,forward,19,1,1,1,1,3916,2 -2304,alias_default_551,call_function,alias.default,forward,19,1,1,3,2165,3926,4 -2305,convert_element_type_470,call_function,convert_element_type.default,forward,19,1,1,1,2166,3924,4 -2306,alias_default_553,call_function,alias.default,forward,19,1,1,2,2167,3923,4 -2307,pow_40,call_function,pow.Tensor_Scalar,forward,19,1,1,1,2168,3922,4 -2308,mean_39,call_function,mean.dim,forward,19,1,1,1,2169,3921,4 -2309,add_97,call_function,add.Scalar,forward,19,1,1,1,2170,3920,3 -2310,rsqrt_39,call_function,rsqrt.default,forward,19,1,1,1,2171,3919,3 -2311,alias_default_554,call_function,alias.default,forward,19,1,1,3,2172,3918,3 -2312,mul_137,call_function,mul.Tensor,forward,19,2,2,1,2173,3914,8 -2313,alias_default_552,call_function,alias.default,forward,19,1,1,2,2,3915,2 -2314,mul_138,call_function,mul.Tensor,forward,19,2,2,1,2177,3913,8 -2315,convert_element_type_471,call_function,convert_element_type.default,forward,19,1,1,1,2178,3912,6 -2316,dtype_cast_178,call_function,dtype_cast.default,forward,19,1,1,1,1,3912,3 -2317,permute_217,call_function,permute.default,forward,19,1,1,1,2,3911,3 
-2318,alias_default_555,call_function,alias.default,forward,19,1,1,4,2179,3911,4 -2319,alias_default_556,call_function,alias.default,forward,19,1,1,2,3,3910,3 -2320,einsum_default_137,call_function,einsum.default,forward,19,2,2,1,2184,3908,5 -2321,alias_default_557,call_function,alias.default,forward,19,1,1,2,2185,3907,4 -2322,convert_element_type_474,call_function,convert_element_type.default,forward,19,1,1,1,2186,3895,4 -2323,alias_default_558,call_function,alias.default,forward,19,1,1,2,2187,3894,4 -2324,neg_19,call_function,neg.default,forward,19,1,1,1,2188,3893,8 -2325,exp_19,call_function,exp.default,forward,19,1,1,1,2189,3892,6 -2326,add_98,call_function,add.Tensor,forward,19,1,1,1,2190,3891,4 -2327,div_19,call_function,div.Tensor,forward,19,2,2,1,2191,3890,6 -2328,convert_element_type_475,call_function,convert_element_type.default,forward,19,1,1,1,2192,3889,6 -2329,dtype_cast_179,call_function,dtype_cast.default,forward,19,1,1,1,1,3893,3 -2330,permute_218,call_function,permute.default,forward,19,1,1,1,2,3892,3 -2331,alias_default_560,call_function,alias.default,forward,19,1,1,2,3,3891,3 -2332,einsum_default_138,call_function,einsum.default,forward,19,2,2,1,2184,3889,5 -2333,alias_default_559,call_function,alias.default,forward,19,1,1,2,2193,3888,4 -2334,alias_default_561,call_function,alias.default,forward,19,1,1,2,2185,3888,4 -2335,mul_139,call_function,mul.Tensor,forward,19,2,2,1,2200,3887,8 -2336,dtype_cast_180,call_function,dtype_cast.default,forward,19,1,1,1,1,3889,3 -2337,permute_219,call_function,permute.default,forward,19,1,1,1,2,3888,3 -2338,alias_default_562,call_function,alias.default,forward,19,1,1,2,2201,3886,4 -2339,alias_default_563,call_function,alias.default,forward,19,1,1,2,3,3887,3 -2340,einsum_default_139,call_function,einsum.default,forward,19,2,2,1,2206,3885,5 -2341,add_99,call_function,add.Tensor,forward,19,2,2,1,2207,3884,10 -2342,dtype_cast_181,call_function,dtype_cast.default,forward,20,1,1,1,1,3873,2 
-2343,alias_default_564,call_function,alias.default,forward,19,1,1,3,2208,3883,4 -2344,convert_element_type_480,call_function,convert_element_type.default,forward,20,1,1,1,2209,3881,4 -2345,alias_default_566,call_function,alias.default,forward,20,1,1,2,2210,3880,4 -2346,pow_41,call_function,pow.Tensor_Scalar,forward,20,1,1,1,2211,3879,4 -2347,mean_40,call_function,mean.dim,forward,20,1,1,1,2212,3878,4 -2348,add_100,call_function,add.Scalar,forward,20,1,1,1,2213,3877,3 -2349,rsqrt_40,call_function,rsqrt.default,forward,20,1,1,1,2214,3876,3 -2350,alias_default_567,call_function,alias.default,forward,20,1,1,3,2215,3875,3 -2351,mul_140,call_function,mul.Tensor,forward,20,2,2,1,2216,3871,8 -2352,alias_default_565,call_function,alias.default,forward,20,1,1,2,2,3872,2 -2353,mul_141,call_function,mul.Tensor,forward,20,2,2,1,2220,3870,8 -2354,convert_element_type_481,call_function,convert_element_type.default,forward,20,1,1,1,2221,3869,6 -2355,dtype_cast_182,call_function,dtype_cast.default,forward,20,1,1,1,1,3856,3 -2356,permute_220,call_function,permute.default,forward,20,1,1,1,2,3855,3 -2357,alias_default_568,call_function,alias.default,forward,20,1,1,6,2222,3868,4 -2358,alias_default_569,call_function,alias.default,forward,20,1,1,2,3,3854,3 -2359,einsum_default_140,call_function,einsum.default,forward,20,2,2,1,2227,3852,5 -2360,dtype_cast_183,call_function,dtype_cast.default,forward,20,1,1,1,1,3856,3 -2361,permute_221,call_function,permute.default,forward,20,1,1,1,2,3855,3 -2362,alias_default_570,call_function,alias.default,forward,20,1,1,2,3,3854,3 -2363,einsum_default_141,call_function,einsum.default,forward,20,2,2,1,2227,3852,5 -2364,dtype_cast_184,call_function,dtype_cast.default,forward,20,1,1,1,1,3849,3 -2365,permute_222,call_function,permute.default,forward,20,1,1,1,2,3848,3 -2366,alias_default_571,call_function,alias.default,forward,20,1,1,2,3,3847,3 -2367,einsum_default_142,call_function,einsum.default,forward,20,2,2,1,2227,3845,5 
-2368,view_466,call_function,view.default,forward,20,1,1,1,2228,3851,4 -2369,view_467,call_function,view.default,forward,20,1,1,1,2228,3851,4 -2370,view_468,call_function,view.default,forward,20,1,1,1,2228,3844,4 -2371,convert_element_type_488,call_function,convert_element_type.default,forward,20,1,1,1,2229,3850,4 -2372,view_469,call_function,view.default,forward,20,1,1,1,2230,3849,4 -2373,view_as_complex_40,call_function,view_as_complex.default,forward,20,1,1,1,2231,3848,6 -2374,convert_element_type_489,call_function,convert_element_type.default,forward,20,1,1,1,2229,3850,4 -2375,view_470,call_function,view.default,forward,20,1,1,1,2230,3849,4 -2376,view_as_complex_41,call_function,view_as_complex.default,forward,20,1,1,1,2231,3848,6 -2377,view_471,call_function,view.default,forward,20,1,1,1,2,3859,3 -2378,alias_default_572,call_function,alias.default,forward,20,1,1,4,3,3858,3 -2379,mul_142,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8 -2380,view_as_real_40,call_function,view_as_real.default,forward,20,1,1,1,2235,3846,6 -2381,view_472,call_function,view.default,forward,20,1,1,1,2236,3845,6 -2382,mul_143,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8 -2383,view_as_real_41,call_function,view_as_real.default,forward,20,1,1,1,2235,3846,6 -2384,view_473,call_function,view.default,forward,20,1,1,1,2236,3845,6 -2385,convert_element_type_490,call_function,convert_element_type.default,forward,20,1,1,1,2237,3844,6 -2386,convert_element_type_491,call_function,convert_element_type.default,forward,20,1,1,1,2237,3844,6 -2387,permute_223,call_function,permute.default,forward,20,1,1,1,2238,3843,6 -2388,permute_224,call_function,permute.default,forward,20,1,1,1,2238,3843,6 -2389,permute_225,call_function,permute.default,forward,20,1,1,1,2229,3843,4 -2390,alias_default_573,call_function,alias.default,forward,20,1,1,2,2239,3842,4 -2391,alias_default_574,call_function,alias.default,forward,20,1,1,2,2239,3842,4 
-2392,alias_default_575,call_function,alias.default,forward,20,1,1,2,2230,3842,4 -2393,_scaled_dot_product_flash_attention_20,call_function,_scaled_dot_product_flash_attention.default,forward,20,3,3,4,2263,3841,2 -2394,getitem_180,call_function,getitem,forward,20,1,1,1,2264,3837,2 -2395,getitem_181,call_function,getitem,forward,20,1,1,1,2264,2264,2 -2396,getitem_186,call_function,getitem,forward,20,1,1,1,2264,2264,1 -2397,getitem_187,call_function,getitem,forward,20,1,1,1,2264,2264,1 -2398,alias_default_576,call_function,alias.default,forward,20,1,1,2,2265,3836,4 -2399,permute_226,call_function,permute.default,forward,20,1,1,1,2266,3835,4 -2400,view_474,call_function,view.default,forward,20,1,1,1,2267,3834,3 -2401,dtype_cast_185,call_function,dtype_cast.default,forward,20,1,1,1,1,3836,3 -2402,permute_227,call_function,permute.default,forward,20,1,1,1,2,3835,3 -2403,alias_default_577,call_function,alias.default,forward,20,1,1,2,2268,3833,4 -2404,alias_default_578,call_function,alias.default,forward,20,1,1,2,3,3834,3 -2405,einsum_default_143,call_function,einsum.default,forward,20,2,2,1,2273,3832,5 -2406,add_101,call_function,add.Tensor,forward,20,2,2,1,2274,3831,10 -2407,dtype_cast_186,call_function,dtype_cast.default,forward,20,1,1,1,1,3820,2 -2408,alias_default_579,call_function,alias.default,forward,20,1,1,3,2275,3830,4 -2409,convert_element_type_494,call_function,convert_element_type.default,forward,20,1,1,1,2276,3828,4 -2410,alias_default_581,call_function,alias.default,forward,20,1,1,2,2277,3827,4 -2411,pow_42,call_function,pow.Tensor_Scalar,forward,20,1,1,1,2278,3826,4 -2412,mean_41,call_function,mean.dim,forward,20,1,1,1,2279,3825,4 -2413,add_102,call_function,add.Scalar,forward,20,1,1,1,2280,3824,3 -2414,rsqrt_41,call_function,rsqrt.default,forward,20,1,1,1,2281,3823,3 -2415,alias_default_582,call_function,alias.default,forward,20,1,1,3,2282,3822,3 -2416,mul_144,call_function,mul.Tensor,forward,20,2,2,1,2283,3818,8 
-2417,alias_default_580,call_function,alias.default,forward,20,1,1,2,2,3819,2 -2418,mul_145,call_function,mul.Tensor,forward,20,2,2,1,2287,3817,8 -2419,convert_element_type_495,call_function,convert_element_type.default,forward,20,1,1,1,2288,3816,6 -2420,dtype_cast_187,call_function,dtype_cast.default,forward,20,1,1,1,1,3816,3 -2421,permute_228,call_function,permute.default,forward,20,1,1,1,2,3815,3 -2422,alias_default_583,call_function,alias.default,forward,20,1,1,4,2289,3815,4 -2423,alias_default_584,call_function,alias.default,forward,20,1,1,2,3,3814,3 -2424,einsum_default_144,call_function,einsum.default,forward,20,2,2,1,2294,3812,5 -2425,alias_default_585,call_function,alias.default,forward,20,1,1,2,2295,3811,4 -2426,convert_element_type_498,call_function,convert_element_type.default,forward,20,1,1,1,2296,3799,4 -2427,alias_default_586,call_function,alias.default,forward,20,1,1,2,2297,3798,4 -2428,neg_20,call_function,neg.default,forward,20,1,1,1,2298,3797,8 -2429,exp_20,call_function,exp.default,forward,20,1,1,1,2299,3796,6 -2430,add_103,call_function,add.Tensor,forward,20,1,1,1,2300,3795,4 -2431,div_20,call_function,div.Tensor,forward,20,2,2,1,2301,3794,6 -2432,convert_element_type_499,call_function,convert_element_type.default,forward,20,1,1,1,2302,3793,6 -2433,dtype_cast_188,call_function,dtype_cast.default,forward,20,1,1,1,1,3797,3 -2434,permute_229,call_function,permute.default,forward,20,1,1,1,2,3796,3 -2435,alias_default_588,call_function,alias.default,forward,20,1,1,2,3,3795,3 -2436,einsum_default_145,call_function,einsum.default,forward,20,2,2,1,2294,3793,5 -2437,alias_default_587,call_function,alias.default,forward,20,1,1,2,2303,3792,4 -2438,alias_default_589,call_function,alias.default,forward,20,1,1,2,2295,3792,4 -2439,mul_146,call_function,mul.Tensor,forward,20,2,2,1,2310,3791,8 -2440,dtype_cast_189,call_function,dtype_cast.default,forward,20,1,1,1,1,3793,3 -2441,permute_230,call_function,permute.default,forward,20,1,1,1,2,3792,3 
-2442,alias_default_590,call_function,alias.default,forward,20,1,1,2,2311,3790,4 -2443,alias_default_591,call_function,alias.default,forward,20,1,1,2,3,3791,3 -2444,einsum_default_146,call_function,einsum.default,forward,20,2,2,1,2316,3789,5 -2445,add_104,call_function,add.Tensor,forward,20,2,2,1,2317,3788,10 -2446,dtype_cast_190,call_function,dtype_cast.default,forward,21,1,1,1,1,3777,2 -2447,alias_default_592,call_function,alias.default,forward,20,1,1,3,2318,3787,4 -2448,convert_element_type_504,call_function,convert_element_type.default,forward,21,1,1,1,2319,3785,4 -2449,alias_default_594,call_function,alias.default,forward,21,1,1,2,2320,3784,4 -2450,pow_43,call_function,pow.Tensor_Scalar,forward,21,1,1,1,2321,3783,4 -2451,mean_42,call_function,mean.dim,forward,21,1,1,1,2322,3782,4 -2452,add_105,call_function,add.Scalar,forward,21,1,1,1,2323,3781,3 -2453,rsqrt_42,call_function,rsqrt.default,forward,21,1,1,1,2324,3780,3 -2454,alias_default_595,call_function,alias.default,forward,21,1,1,3,2325,3779,3 -2455,mul_147,call_function,mul.Tensor,forward,21,2,2,1,2326,3775,8 -2456,alias_default_593,call_function,alias.default,forward,21,1,1,2,2,3776,2 -2457,mul_148,call_function,mul.Tensor,forward,21,2,2,1,2330,3774,8 -2458,convert_element_type_505,call_function,convert_element_type.default,forward,21,1,1,1,2331,3773,6 -2459,dtype_cast_191,call_function,dtype_cast.default,forward,21,1,1,1,1,3760,3 -2460,permute_231,call_function,permute.default,forward,21,1,1,1,2,3759,3 -2461,alias_default_596,call_function,alias.default,forward,21,1,1,6,2332,3772,4 -2462,alias_default_597,call_function,alias.default,forward,21,1,1,2,3,3758,3 -2463,einsum_default_147,call_function,einsum.default,forward,21,2,2,1,2337,3756,5 -2464,dtype_cast_192,call_function,dtype_cast.default,forward,21,1,1,1,1,3760,3 -2465,permute_232,call_function,permute.default,forward,21,1,1,1,2,3759,3 -2466,alias_default_598,call_function,alias.default,forward,21,1,1,2,3,3758,3 
-2467,einsum_default_148,call_function,einsum.default,forward,21,2,2,1,2337,3756,5 -2468,dtype_cast_193,call_function,dtype_cast.default,forward,21,1,1,1,1,3753,3 -2469,permute_233,call_function,permute.default,forward,21,1,1,1,2,3752,3 -2470,alias_default_599,call_function,alias.default,forward,21,1,1,2,3,3751,3 -2471,einsum_default_149,call_function,einsum.default,forward,21,2,2,1,2337,3749,5 -2472,view_489,call_function,view.default,forward,21,1,1,1,2338,3755,4 -2473,view_490,call_function,view.default,forward,21,1,1,1,2338,3755,4 -2474,view_491,call_function,view.default,forward,21,1,1,1,2338,3748,4 -2475,convert_element_type_512,call_function,convert_element_type.default,forward,21,1,1,1,2339,3754,4 -2476,view_492,call_function,view.default,forward,21,1,1,1,2340,3753,4 -2477,view_as_complex_42,call_function,view_as_complex.default,forward,21,1,1,1,2341,3752,6 -2478,convert_element_type_513,call_function,convert_element_type.default,forward,21,1,1,1,2339,3754,4 -2479,view_493,call_function,view.default,forward,21,1,1,1,2340,3753,4 -2480,view_as_complex_43,call_function,view_as_complex.default,forward,21,1,1,1,2341,3752,6 -2481,view_494,call_function,view.default,forward,21,1,1,1,2,3763,3 -2482,alias_default_600,call_function,alias.default,forward,21,1,1,4,3,3762,3 -2483,mul_149,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8 -2484,view_as_real_42,call_function,view_as_real.default,forward,21,1,1,1,2345,3750,6 -2485,view_495,call_function,view.default,forward,21,1,1,1,2346,3749,6 -2486,mul_150,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8 -2487,view_as_real_43,call_function,view_as_real.default,forward,21,1,1,1,2345,3750,6 -2488,view_496,call_function,view.default,forward,21,1,1,1,2346,3749,6 -2489,convert_element_type_514,call_function,convert_element_type.default,forward,21,1,1,1,2347,3748,6 -2490,convert_element_type_515,call_function,convert_element_type.default,forward,21,1,1,1,2347,3748,6 
-2491,permute_234,call_function,permute.default,forward,21,1,1,1,2348,3747,6 -2492,permute_235,call_function,permute.default,forward,21,1,1,1,2348,3747,6 -2493,permute_236,call_function,permute.default,forward,21,1,1,1,2339,3747,4 -2494,alias_default_601,call_function,alias.default,forward,21,1,1,2,2349,3746,4 -2495,alias_default_602,call_function,alias.default,forward,21,1,1,2,2349,3746,4 -2496,alias_default_603,call_function,alias.default,forward,21,1,1,2,2340,3746,4 -2497,_scaled_dot_product_flash_attention_21,call_function,_scaled_dot_product_flash_attention.default,forward,21,3,3,4,2373,3745,2 -2498,getitem_189,call_function,getitem,forward,21,1,1,1,2374,3741,2 -2499,getitem_190,call_function,getitem,forward,21,1,1,1,2374,2374,2 -2500,getitem_195,call_function,getitem,forward,21,1,1,1,2374,2374,1 -2501,getitem_196,call_function,getitem,forward,21,1,1,1,2374,2374,1 -2502,alias_default_604,call_function,alias.default,forward,21,1,1,2,2375,3740,4 -2503,permute_237,call_function,permute.default,forward,21,1,1,1,2376,3739,4 -2504,view_497,call_function,view.default,forward,21,1,1,1,2377,3738,3 -2505,dtype_cast_194,call_function,dtype_cast.default,forward,21,1,1,1,1,3740,3 -2506,permute_238,call_function,permute.default,forward,21,1,1,1,2,3739,3 -2507,alias_default_605,call_function,alias.default,forward,21,1,1,2,2378,3737,4 -2508,alias_default_606,call_function,alias.default,forward,21,1,1,2,3,3738,3 -2509,einsum_default_150,call_function,einsum.default,forward,21,2,2,1,2383,3736,5 -2510,add_106,call_function,add.Tensor,forward,21,2,2,1,2384,3735,10 -2511,dtype_cast_195,call_function,dtype_cast.default,forward,21,1,1,1,1,3724,2 -2512,alias_default_607,call_function,alias.default,forward,21,1,1,3,2385,3734,4 -2513,convert_element_type_518,call_function,convert_element_type.default,forward,21,1,1,1,2386,3732,4 -2514,alias_default_609,call_function,alias.default,forward,21,1,1,2,2387,3731,4 -2515,pow_44,call_function,pow.Tensor_Scalar,forward,21,1,1,1,2388,3730,4 
-2516,mean_43,call_function,mean.dim,forward,21,1,1,1,2389,3729,4 -2517,add_107,call_function,add.Scalar,forward,21,1,1,1,2390,3728,3 -2518,rsqrt_43,call_function,rsqrt.default,forward,21,1,1,1,2391,3727,3 -2519,alias_default_610,call_function,alias.default,forward,21,1,1,3,2392,3726,3 -2520,mul_151,call_function,mul.Tensor,forward,21,2,2,1,2393,3722,8 -2521,alias_default_608,call_function,alias.default,forward,21,1,1,2,2,3723,2 -2522,mul_152,call_function,mul.Tensor,forward,21,2,2,1,2397,3721,8 -2523,convert_element_type_519,call_function,convert_element_type.default,forward,21,1,1,1,2398,3720,6 -2524,dtype_cast_196,call_function,dtype_cast.default,forward,21,1,1,1,1,3720,3 -2525,permute_239,call_function,permute.default,forward,21,1,1,1,2,3719,3 -2526,alias_default_611,call_function,alias.default,forward,21,1,1,4,2399,3719,4 -2527,alias_default_612,call_function,alias.default,forward,21,1,1,2,3,3718,3 -2528,einsum_default_151,call_function,einsum.default,forward,21,2,2,1,2404,3716,5 -2529,alias_default_613,call_function,alias.default,forward,21,1,1,2,2405,3715,4 -2530,convert_element_type_522,call_function,convert_element_type.default,forward,21,1,1,1,2406,3703,4 -2531,alias_default_614,call_function,alias.default,forward,21,1,1,2,2407,3702,4 -2532,neg_21,call_function,neg.default,forward,21,1,1,1,2408,3701,8 -2533,exp_21,call_function,exp.default,forward,21,1,1,1,2409,3700,6 -2534,add_108,call_function,add.Tensor,forward,21,1,1,1,2410,3699,4 -2535,div_21,call_function,div.Tensor,forward,21,2,2,1,2411,3698,6 -2536,convert_element_type_523,call_function,convert_element_type.default,forward,21,1,1,1,2412,3697,6 -2537,dtype_cast_197,call_function,dtype_cast.default,forward,21,1,1,1,1,3701,3 -2538,permute_240,call_function,permute.default,forward,21,1,1,1,2,3700,3 -2539,alias_default_616,call_function,alias.default,forward,21,1,1,2,3,3699,3 -2540,einsum_default_152,call_function,einsum.default,forward,21,2,2,1,2404,3697,5 
-2541,alias_default_615,call_function,alias.default,forward,21,1,1,2,2413,3696,4 -2542,alias_default_617,call_function,alias.default,forward,21,1,1,2,2405,3696,4 -2543,mul_153,call_function,mul.Tensor,forward,21,2,2,1,2420,3695,8 -2544,dtype_cast_198,call_function,dtype_cast.default,forward,21,1,1,1,1,3697,3 -2545,permute_241,call_function,permute.default,forward,21,1,1,1,2,3696,3 -2546,alias_default_618,call_function,alias.default,forward,21,1,1,2,2421,3694,4 -2547,alias_default_619,call_function,alias.default,forward,21,1,1,2,3,3695,3 -2548,einsum_default_153,call_function,einsum.default,forward,21,2,2,1,2426,3693,5 -2549,add_109,call_function,add.Tensor,forward,21,2,2,1,2427,3692,10 -2550,dtype_cast_199,call_function,dtype_cast.default,forward,22,1,1,1,1,3681,2 -2551,alias_default_620,call_function,alias.default,forward,21,1,1,3,2428,3691,4 -2552,convert_element_type_528,call_function,convert_element_type.default,forward,22,1,1,1,2429,3689,4 -2553,alias_default_622,call_function,alias.default,forward,22,1,1,2,2430,3688,4 -2554,pow_45,call_function,pow.Tensor_Scalar,forward,22,1,1,1,2431,3687,4 -2555,mean_44,call_function,mean.dim,forward,22,1,1,1,2432,3686,4 -2556,add_110,call_function,add.Scalar,forward,22,1,1,1,2433,3685,3 -2557,rsqrt_44,call_function,rsqrt.default,forward,22,1,1,1,2434,3684,3 -2558,alias_default_623,call_function,alias.default,forward,22,1,1,3,2435,3683,3 -2559,mul_154,call_function,mul.Tensor,forward,22,2,2,1,2436,3679,8 -2560,alias_default_621,call_function,alias.default,forward,22,1,1,2,2,3680,2 -2561,mul_155,call_function,mul.Tensor,forward,22,2,2,1,2440,3678,8 -2562,convert_element_type_529,call_function,convert_element_type.default,forward,22,1,1,1,2441,3677,6 -2563,dtype_cast_200,call_function,dtype_cast.default,forward,22,1,1,1,1,3664,3 -2564,permute_242,call_function,permute.default,forward,22,1,1,1,2,3663,3 -2565,alias_default_624,call_function,alias.default,forward,22,1,1,6,2442,3676,4 
-2566,alias_default_625,call_function,alias.default,forward,22,1,1,2,3,3662,3 -2567,einsum_default_154,call_function,einsum.default,forward,22,2,2,1,2447,3660,5 -2568,dtype_cast_201,call_function,dtype_cast.default,forward,22,1,1,1,1,3664,3 -2569,permute_243,call_function,permute.default,forward,22,1,1,1,2,3663,3 -2570,alias_default_626,call_function,alias.default,forward,22,1,1,2,3,3662,3 -2571,einsum_default_155,call_function,einsum.default,forward,22,2,2,1,2447,3660,5 -2572,dtype_cast_202,call_function,dtype_cast.default,forward,22,1,1,1,1,3657,3 -2573,permute_244,call_function,permute.default,forward,22,1,1,1,2,3656,3 -2574,alias_default_627,call_function,alias.default,forward,22,1,1,2,3,3655,3 -2575,einsum_default_156,call_function,einsum.default,forward,22,2,2,1,2447,3653,5 -2576,view_512,call_function,view.default,forward,22,1,1,1,2448,3659,4 -2577,view_513,call_function,view.default,forward,22,1,1,1,2448,3659,4 -2578,view_514,call_function,view.default,forward,22,1,1,1,2448,3652,4 -2579,convert_element_type_536,call_function,convert_element_type.default,forward,22,1,1,1,2449,3658,4 -2580,view_515,call_function,view.default,forward,22,1,1,1,2450,3657,4 -2581,view_as_complex_44,call_function,view_as_complex.default,forward,22,1,1,1,2451,3656,6 -2582,convert_element_type_537,call_function,convert_element_type.default,forward,22,1,1,1,2449,3658,4 -2583,view_516,call_function,view.default,forward,22,1,1,1,2450,3657,4 -2584,view_as_complex_45,call_function,view_as_complex.default,forward,22,1,1,1,2451,3656,6 -2585,view_517,call_function,view.default,forward,22,1,1,1,2,3667,3 -2586,alias_default_628,call_function,alias.default,forward,22,1,1,4,3,3666,3 -2587,mul_156,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8 -2588,view_as_real_44,call_function,view_as_real.default,forward,22,1,1,1,2455,3654,6 -2589,view_518,call_function,view.default,forward,22,1,1,1,2456,3653,6 -2590,mul_157,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8 
-2591,view_as_real_45,call_function,view_as_real.default,forward,22,1,1,1,2455,3654,6 -2592,view_519,call_function,view.default,forward,22,1,1,1,2456,3653,6 -2593,convert_element_type_538,call_function,convert_element_type.default,forward,22,1,1,1,2457,3652,6 -2594,convert_element_type_539,call_function,convert_element_type.default,forward,22,1,1,1,2457,3652,6 -2595,permute_245,call_function,permute.default,forward,22,1,1,1,2458,3651,6 -2596,permute_246,call_function,permute.default,forward,22,1,1,1,2458,3651,6 -2597,permute_247,call_function,permute.default,forward,22,1,1,1,2449,3651,4 -2598,alias_default_629,call_function,alias.default,forward,22,1,1,2,2459,3650,4 -2599,alias_default_630,call_function,alias.default,forward,22,1,1,2,2459,3650,4 -2600,alias_default_631,call_function,alias.default,forward,22,1,1,2,2450,3650,4 -2601,_scaled_dot_product_flash_attention_22,call_function,_scaled_dot_product_flash_attention.default,forward,22,3,3,4,2483,3649,2 -2602,getitem_198,call_function,getitem,forward,22,1,1,1,2484,3645,2 -2603,getitem_199,call_function,getitem,forward,22,1,1,1,2484,2484,2 -2604,getitem_204,call_function,getitem,forward,22,1,1,1,2484,2484,1 -2605,getitem_205,call_function,getitem,forward,22,1,1,1,2484,2484,1 -2606,alias_default_632,call_function,alias.default,forward,22,1,1,2,2485,3644,4 -2607,permute_248,call_function,permute.default,forward,22,1,1,1,2486,3643,4 -2608,view_520,call_function,view.default,forward,22,1,1,1,2487,3642,3 -2609,dtype_cast_203,call_function,dtype_cast.default,forward,22,1,1,1,1,3644,3 -2610,permute_249,call_function,permute.default,forward,22,1,1,1,2,3643,3 -2611,alias_default_633,call_function,alias.default,forward,22,1,1,2,2488,3641,4 -2612,alias_default_634,call_function,alias.default,forward,22,1,1,2,3,3642,3 -2613,einsum_default_157,call_function,einsum.default,forward,22,2,2,1,2493,3640,5 -2614,add_111,call_function,add.Tensor,forward,22,2,2,1,2494,3639,10 
-2615,dtype_cast_204,call_function,dtype_cast.default,forward,22,1,1,1,1,3628,2 -2616,alias_default_635,call_function,alias.default,forward,22,1,1,3,2495,3638,4 -2617,convert_element_type_542,call_function,convert_element_type.default,forward,22,1,1,1,2496,3636,4 -2618,alias_default_637,call_function,alias.default,forward,22,1,1,2,2497,3635,4 -2619,pow_46,call_function,pow.Tensor_Scalar,forward,22,1,1,1,2498,3634,4 -2620,mean_45,call_function,mean.dim,forward,22,1,1,1,2499,3633,4 -2621,add_112,call_function,add.Scalar,forward,22,1,1,1,2500,3632,3 -2622,rsqrt_45,call_function,rsqrt.default,forward,22,1,1,1,2501,3631,3 -2623,alias_default_638,call_function,alias.default,forward,22,1,1,3,2502,3630,3 -2624,mul_158,call_function,mul.Tensor,forward,22,2,2,1,2503,3626,8 -2625,alias_default_636,call_function,alias.default,forward,22,1,1,2,2,3627,2 -2626,mul_159,call_function,mul.Tensor,forward,22,2,2,1,2507,3625,8 -2627,convert_element_type_543,call_function,convert_element_type.default,forward,22,1,1,1,2508,3624,6 -2628,dtype_cast_205,call_function,dtype_cast.default,forward,22,1,1,1,1,3624,3 -2629,permute_250,call_function,permute.default,forward,22,1,1,1,2,3623,3 -2630,alias_default_639,call_function,alias.default,forward,22,1,1,4,2509,3623,4 -2631,alias_default_640,call_function,alias.default,forward,22,1,1,2,3,3622,3 -2632,einsum_default_158,call_function,einsum.default,forward,22,2,2,1,2514,3620,5 -2633,alias_default_641,call_function,alias.default,forward,22,1,1,2,2515,3619,4 -2634,convert_element_type_546,call_function,convert_element_type.default,forward,22,1,1,1,2516,3607,4 -2635,alias_default_642,call_function,alias.default,forward,22,1,1,2,2517,3606,4 -2636,neg_22,call_function,neg.default,forward,22,1,1,1,2518,3605,8 -2637,exp_22,call_function,exp.default,forward,22,1,1,1,2519,3604,6 -2638,add_113,call_function,add.Tensor,forward,22,1,1,1,2520,3603,4 -2639,div_22,call_function,div.Tensor,forward,22,2,2,1,2521,3602,6 
-2640,convert_element_type_547,call_function,convert_element_type.default,forward,22,1,1,1,2522,3601,6 -2641,dtype_cast_206,call_function,dtype_cast.default,forward,22,1,1,1,1,3605,3 -2642,permute_251,call_function,permute.default,forward,22,1,1,1,2,3604,3 -2643,alias_default_644,call_function,alias.default,forward,22,1,1,2,3,3603,3 -2644,einsum_default_159,call_function,einsum.default,forward,22,2,2,1,2514,3601,5 -2645,alias_default_643,call_function,alias.default,forward,22,1,1,2,2523,3600,4 -2646,alias_default_645,call_function,alias.default,forward,22,1,1,2,2515,3600,4 -2647,mul_160,call_function,mul.Tensor,forward,22,2,2,1,2530,3599,8 -2648,dtype_cast_207,call_function,dtype_cast.default,forward,22,1,1,1,1,3601,3 -2649,permute_252,call_function,permute.default,forward,22,1,1,1,2,3600,3 -2650,alias_default_646,call_function,alias.default,forward,22,1,1,2,2531,3598,4 -2651,alias_default_647,call_function,alias.default,forward,22,1,1,2,3,3599,3 -2652,einsum_default_160,call_function,einsum.default,forward,22,2,2,1,2536,3597,5 -2653,add_114,call_function,add.Tensor,forward,22,2,2,1,2537,3596,10 -2654,dtype_cast_208,call_function,dtype_cast.default,forward,23,1,1,1,1,3585,2 -2655,alias_default_648,call_function,alias.default,forward,22,1,1,3,2538,3595,4 -2656,convert_element_type_552,call_function,convert_element_type.default,forward,23,1,1,1,2539,3593,4 -2657,alias_default_650,call_function,alias.default,forward,23,1,1,2,2540,3592,4 -2658,pow_47,call_function,pow.Tensor_Scalar,forward,23,1,1,1,2541,3591,4 -2659,mean_46,call_function,mean.dim,forward,23,1,1,1,2542,3590,4 -2660,add_115,call_function,add.Scalar,forward,23,1,1,1,2543,3589,3 -2661,rsqrt_46,call_function,rsqrt.default,forward,23,1,1,1,2544,3588,3 -2662,alias_default_651,call_function,alias.default,forward,23,1,1,3,2545,3587,3 -2663,mul_161,call_function,mul.Tensor,forward,23,2,2,1,2546,3583,8 -2664,alias_default_649,call_function,alias.default,forward,23,1,1,2,2,3584,2 
-2665,mul_162,call_function,mul.Tensor,forward,23,2,2,1,2550,3582,8 -2666,convert_element_type_553,call_function,convert_element_type.default,forward,23,1,1,1,2551,3581,6 -2667,dtype_cast_209,call_function,dtype_cast.default,forward,23,1,1,1,1,3568,3 -2668,permute_253,call_function,permute.default,forward,23,1,1,1,2,3567,3 -2669,alias_default_652,call_function,alias.default,forward,23,1,1,6,2552,3580,4 -2670,alias_default_653,call_function,alias.default,forward,23,1,1,2,3,3566,3 -2671,einsum_default_161,call_function,einsum.default,forward,23,2,2,1,2557,3564,5 -2672,dtype_cast_210,call_function,dtype_cast.default,forward,23,1,1,1,1,3568,3 -2673,permute_254,call_function,permute.default,forward,23,1,1,1,2,3567,3 -2674,alias_default_654,call_function,alias.default,forward,23,1,1,2,3,3566,3 -2675,einsum_default_162,call_function,einsum.default,forward,23,2,2,1,2557,3564,5 -2676,dtype_cast_211,call_function,dtype_cast.default,forward,23,1,1,1,1,3561,3 -2677,permute_255,call_function,permute.default,forward,23,1,1,1,2,3560,3 -2678,alias_default_655,call_function,alias.default,forward,23,1,1,2,3,3559,3 -2679,einsum_default_163,call_function,einsum.default,forward,23,2,2,1,2557,3557,5 -2680,view_535,call_function,view.default,forward,23,1,1,1,2558,3563,4 -2681,view_536,call_function,view.default,forward,23,1,1,1,2558,3563,4 -2682,view_537,call_function,view.default,forward,23,1,1,1,2558,3556,4 -2683,convert_element_type_560,call_function,convert_element_type.default,forward,23,1,1,1,2559,3562,4 -2684,view_538,call_function,view.default,forward,23,1,1,1,2560,3561,4 -2685,view_as_complex_46,call_function,view_as_complex.default,forward,23,1,1,1,2561,3560,6 -2686,convert_element_type_561,call_function,convert_element_type.default,forward,23,1,1,1,2559,3562,4 -2687,view_539,call_function,view.default,forward,23,1,1,1,2560,3561,4 -2688,view_as_complex_47,call_function,view_as_complex.default,forward,23,1,1,1,2561,3560,6 
-2689,view_540,call_function,view.default,forward,23,1,1,1,2,3571,3 -2690,alias_default_656,call_function,alias.default,forward,23,1,1,4,3,3570,3 -2691,mul_163,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8 -2692,view_as_real_46,call_function,view_as_real.default,forward,23,1,1,1,2565,3558,6 -2693,view_541,call_function,view.default,forward,23,1,1,1,2566,3557,6 -2694,mul_164,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8 -2695,view_as_real_47,call_function,view_as_real.default,forward,23,1,1,1,2565,3558,6 -2696,view_542,call_function,view.default,forward,23,1,1,1,2566,3557,6 -2697,convert_element_type_562,call_function,convert_element_type.default,forward,23,1,1,1,2567,3556,6 -2698,convert_element_type_563,call_function,convert_element_type.default,forward,23,1,1,1,2567,3556,6 -2699,permute_256,call_function,permute.default,forward,23,1,1,1,2568,3555,6 -2700,permute_257,call_function,permute.default,forward,23,1,1,1,2568,3555,6 -2701,permute_258,call_function,permute.default,forward,23,1,1,1,2559,3555,4 -2702,alias_default_657,call_function,alias.default,forward,23,1,1,2,2569,3554,4 -2703,alias_default_658,call_function,alias.default,forward,23,1,1,2,2569,3554,4 -2704,alias_default_659,call_function,alias.default,forward,23,1,1,2,2560,3554,4 -2705,_scaled_dot_product_flash_attention_23,call_function,_scaled_dot_product_flash_attention.default,forward,23,3,3,4,2593,3553,2 -2706,getitem_207,call_function,getitem,forward,23,1,1,1,2594,3549,2 -2707,getitem_208,call_function,getitem,forward,23,1,1,1,2594,2594,2 -2708,getitem_213,call_function,getitem,forward,23,1,1,1,2594,2594,1 -2709,getitem_214,call_function,getitem,forward,23,1,1,1,2594,2594,1 -2710,alias_default_660,call_function,alias.default,forward,23,1,1,2,2595,3548,4 -2711,permute_259,call_function,permute.default,forward,23,1,1,1,2596,3547,4 -2712,view_543,call_function,view.default,forward,23,1,1,1,2597,3546,3 -2713,dtype_cast_212,call_function,dtype_cast.default,forward,23,1,1,1,1,3548,3 
-2714,permute_260,call_function,permute.default,forward,23,1,1,1,2,3547,3 -2715,alias_default_661,call_function,alias.default,forward,23,1,1,2,2598,3545,4 -2716,alias_default_662,call_function,alias.default,forward,23,1,1,2,3,3546,3 -2717,einsum_default_164,call_function,einsum.default,forward,23,2,2,1,2603,3544,5 -2718,add_116,call_function,add.Tensor,forward,23,2,2,1,2604,3543,10 -2719,dtype_cast_213,call_function,dtype_cast.default,forward,23,1,1,1,1,3532,2 -2720,alias_default_663,call_function,alias.default,forward,23,1,1,3,2605,3542,4 -2721,convert_element_type_566,call_function,convert_element_type.default,forward,23,1,1,1,2606,3540,4 -2722,alias_default_665,call_function,alias.default,forward,23,1,1,2,2607,3539,4 -2723,pow_48,call_function,pow.Tensor_Scalar,forward,23,1,1,1,2608,3538,4 -2724,mean_47,call_function,mean.dim,forward,23,1,1,1,2609,3537,4 -2725,add_117,call_function,add.Scalar,forward,23,1,1,1,2610,3536,3 -2726,rsqrt_47,call_function,rsqrt.default,forward,23,1,1,1,2611,3535,3 -2727,alias_default_666,call_function,alias.default,forward,23,1,1,3,2612,3534,3 -2728,mul_165,call_function,mul.Tensor,forward,23,2,2,1,2613,3530,8 -2729,alias_default_664,call_function,alias.default,forward,23,1,1,2,2,3531,2 -2730,mul_166,call_function,mul.Tensor,forward,23,2,2,1,2617,3529,8 -2731,convert_element_type_567,call_function,convert_element_type.default,forward,23,1,1,1,2618,3528,6 -2732,dtype_cast_214,call_function,dtype_cast.default,forward,23,1,1,1,1,3528,3 -2733,permute_261,call_function,permute.default,forward,23,1,1,1,2,3527,3 -2734,alias_default_667,call_function,alias.default,forward,23,1,1,4,2619,3527,4 -2735,alias_default_668,call_function,alias.default,forward,23,1,1,2,3,3526,3 -2736,einsum_default_165,call_function,einsum.default,forward,23,2,2,1,2624,3524,5 -2737,alias_default_669,call_function,alias.default,forward,23,1,1,2,2625,3523,4 -2738,convert_element_type_570,call_function,convert_element_type.default,forward,23,1,1,1,2626,3511,4 
-2739,alias_default_670,call_function,alias.default,forward,23,1,1,2,2627,3510,4 -2740,neg_23,call_function,neg.default,forward,23,1,1,1,2628,3509,8 -2741,exp_23,call_function,exp.default,forward,23,1,1,1,2629,3508,6 -2742,add_118,call_function,add.Tensor,forward,23,1,1,1,2630,3507,4 -2743,div_23,call_function,div.Tensor,forward,23,2,2,1,2631,3506,6 -2744,convert_element_type_571,call_function,convert_element_type.default,forward,23,1,1,1,2632,3505,6 -2745,dtype_cast_215,call_function,dtype_cast.default,forward,23,1,1,1,1,3509,3 -2746,permute_262,call_function,permute.default,forward,23,1,1,1,2,3508,3 -2747,alias_default_672,call_function,alias.default,forward,23,1,1,2,3,3507,3 -2748,einsum_default_166,call_function,einsum.default,forward,23,2,2,1,2624,3505,5 -2749,alias_default_671,call_function,alias.default,forward,23,1,1,2,2633,3504,4 -2750,alias_default_673,call_function,alias.default,forward,23,1,1,2,2625,3504,4 -2751,mul_167,call_function,mul.Tensor,forward,23,2,2,1,2640,3503,8 -2752,dtype_cast_216,call_function,dtype_cast.default,forward,23,1,1,1,1,3505,3 -2753,permute_263,call_function,permute.default,forward,23,1,1,1,2,3504,3 -2754,alias_default_674,call_function,alias.default,forward,23,1,1,2,2641,3502,4 -2755,alias_default_675,call_function,alias.default,forward,23,1,1,2,3,3503,3 -2756,einsum_default_167,call_function,einsum.default,forward,23,2,2,1,2646,3501,5 -2757,add_119,call_function,add.Tensor,forward,23,2,2,1,2647,3500,10 -2758,dtype_cast_217,call_function,dtype_cast.default,forward,24,1,1,1,1,3489,2 -2759,alias_default_676,call_function,alias.default,forward,23,1,1,3,2648,3499,4 -2760,convert_element_type_576,call_function,convert_element_type.default,forward,24,1,1,1,2649,3497,4 -2761,alias_default_678,call_function,alias.default,forward,24,1,1,2,2650,3496,4 -2762,pow_49,call_function,pow.Tensor_Scalar,forward,24,1,1,1,2651,3495,4 -2763,mean_48,call_function,mean.dim,forward,24,1,1,1,2652,3494,4 
-2764,add_120,call_function,add.Scalar,forward,24,1,1,1,2653,3493,3 -2765,rsqrt_48,call_function,rsqrt.default,forward,24,1,1,1,2654,3492,3 -2766,alias_default_679,call_function,alias.default,forward,24,1,1,3,2655,3491,3 -2767,mul_168,call_function,mul.Tensor,forward,24,2,2,1,2656,3487,8 -2768,alias_default_677,call_function,alias.default,forward,24,1,1,2,2,3488,2 -2769,mul_169,call_function,mul.Tensor,forward,24,2,2,1,2660,3486,8 -2770,convert_element_type_577,call_function,convert_element_type.default,forward,24,1,1,1,2661,3485,6 -2771,dtype_cast_218,call_function,dtype_cast.default,forward,24,1,1,1,1,3472,3 -2772,permute_264,call_function,permute.default,forward,24,1,1,1,2,3471,3 -2773,alias_default_680,call_function,alias.default,forward,24,1,1,6,2662,3484,4 -2774,alias_default_681,call_function,alias.default,forward,24,1,1,2,3,3470,3 -2775,einsum_default_168,call_function,einsum.default,forward,24,2,2,1,2667,3468,5 -2776,dtype_cast_219,call_function,dtype_cast.default,forward,24,1,1,1,1,3472,3 -2777,permute_265,call_function,permute.default,forward,24,1,1,1,2,3471,3 -2778,alias_default_682,call_function,alias.default,forward,24,1,1,2,3,3470,3 -2779,einsum_default_169,call_function,einsum.default,forward,24,2,2,1,2667,3468,5 -2780,dtype_cast_220,call_function,dtype_cast.default,forward,24,1,1,1,1,3465,3 -2781,permute_266,call_function,permute.default,forward,24,1,1,1,2,3464,3 -2782,alias_default_683,call_function,alias.default,forward,24,1,1,2,3,3463,3 -2783,einsum_default_170,call_function,einsum.default,forward,24,2,2,1,2667,3461,5 -2784,view_558,call_function,view.default,forward,24,1,1,1,2668,3467,4 -2785,view_559,call_function,view.default,forward,24,1,1,1,2668,3467,4 -2786,view_560,call_function,view.default,forward,24,1,1,1,2668,3460,4 -2787,convert_element_type_584,call_function,convert_element_type.default,forward,24,1,1,1,2669,3466,4 -2788,view_561,call_function,view.default,forward,24,1,1,1,2670,3465,4 
-2789,view_as_complex_48,call_function,view_as_complex.default,forward,24,1,1,1,2671,3464,6 -2790,convert_element_type_585,call_function,convert_element_type.default,forward,24,1,1,1,2669,3466,4 -2791,view_562,call_function,view.default,forward,24,1,1,1,2670,3465,4 -2792,view_as_complex_49,call_function,view_as_complex.default,forward,24,1,1,1,2671,3464,6 -2793,view_563,call_function,view.default,forward,24,1,1,1,2,3475,3 -2794,alias_default_684,call_function,alias.default,forward,24,1,1,4,3,3474,3 -2795,mul_170,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8 -2796,view_as_real_48,call_function,view_as_real.default,forward,24,1,1,1,2675,3462,6 -2797,view_564,call_function,view.default,forward,24,1,1,1,2676,3461,6 -2798,mul_171,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8 -2799,view_as_real_49,call_function,view_as_real.default,forward,24,1,1,1,2675,3462,6 -2800,view_565,call_function,view.default,forward,24,1,1,1,2676,3461,6 -2801,convert_element_type_586,call_function,convert_element_type.default,forward,24,1,1,1,2677,3460,6 -2802,convert_element_type_587,call_function,convert_element_type.default,forward,24,1,1,1,2677,3460,6 -2803,permute_267,call_function,permute.default,forward,24,1,1,1,2678,3459,6 -2804,permute_268,call_function,permute.default,forward,24,1,1,1,2678,3459,6 -2805,permute_269,call_function,permute.default,forward,24,1,1,1,2669,3459,4 -2806,alias_default_685,call_function,alias.default,forward,24,1,1,2,2679,3458,4 -2807,alias_default_686,call_function,alias.default,forward,24,1,1,2,2679,3458,4 -2808,alias_default_687,call_function,alias.default,forward,24,1,1,2,2670,3458,4 -2809,_scaled_dot_product_flash_attention_24,call_function,_scaled_dot_product_flash_attention.default,forward,24,3,3,4,2703,3457,2 -2810,getitem_216,call_function,getitem,forward,24,1,1,1,2704,3453,2 -2811,getitem_217,call_function,getitem,forward,24,1,1,1,2704,2704,2 -2812,getitem_222,call_function,getitem,forward,24,1,1,1,2704,2704,1 
-2813,getitem_223,call_function,getitem,forward,24,1,1,1,2704,2704,1 -2814,alias_default_688,call_function,alias.default,forward,24,1,1,2,2705,3452,4 -2815,permute_270,call_function,permute.default,forward,24,1,1,1,2706,3451,4 -2816,view_566,call_function,view.default,forward,24,1,1,1,2707,3450,3 -2817,dtype_cast_221,call_function,dtype_cast.default,forward,24,1,1,1,1,3452,3 -2818,permute_271,call_function,permute.default,forward,24,1,1,1,2,3451,3 -2819,alias_default_689,call_function,alias.default,forward,24,1,1,2,2708,3449,4 -2820,alias_default_690,call_function,alias.default,forward,24,1,1,2,3,3450,3 -2821,einsum_default_171,call_function,einsum.default,forward,24,2,2,1,2713,3448,5 -2822,add_121,call_function,add.Tensor,forward,24,2,2,1,2714,3447,10 -2823,dtype_cast_222,call_function,dtype_cast.default,forward,24,1,1,1,1,3436,2 -2824,alias_default_691,call_function,alias.default,forward,24,1,1,3,2715,3446,4 -2825,convert_element_type_590,call_function,convert_element_type.default,forward,24,1,1,1,2716,3444,4 -2826,alias_default_693,call_function,alias.default,forward,24,1,1,2,2717,3443,4 -2827,pow_50,call_function,pow.Tensor_Scalar,forward,24,1,1,1,2718,3442,4 -2828,mean_49,call_function,mean.dim,forward,24,1,1,1,2719,3441,4 -2829,add_122,call_function,add.Scalar,forward,24,1,1,1,2720,3440,3 -2830,rsqrt_49,call_function,rsqrt.default,forward,24,1,1,1,2721,3439,3 -2831,alias_default_694,call_function,alias.default,forward,24,1,1,3,2722,3438,3 -2832,mul_172,call_function,mul.Tensor,forward,24,2,2,1,2723,3434,8 -2833,alias_default_692,call_function,alias.default,forward,24,1,1,2,2,3435,2 -2834,mul_173,call_function,mul.Tensor,forward,24,2,2,1,2727,3433,8 -2835,convert_element_type_591,call_function,convert_element_type.default,forward,24,1,1,1,2728,3432,6 -2836,dtype_cast_223,call_function,dtype_cast.default,forward,24,1,1,1,1,3432,3 -2837,permute_272,call_function,permute.default,forward,24,1,1,1,2,3431,3 
-2838,alias_default_695,call_function,alias.default,forward,24,1,1,4,2729,3431,4 -2839,alias_default_696,call_function,alias.default,forward,24,1,1,2,3,3430,3 -2840,einsum_default_172,call_function,einsum.default,forward,24,2,2,1,2734,3428,5 -2841,alias_default_697,call_function,alias.default,forward,24,1,1,2,2735,3427,4 -2842,convert_element_type_594,call_function,convert_element_type.default,forward,24,1,1,1,2736,3415,4 -2843,alias_default_698,call_function,alias.default,forward,24,1,1,2,2737,3414,4 -2844,neg_24,call_function,neg.default,forward,24,1,1,1,2738,3413,8 -2845,exp_24,call_function,exp.default,forward,24,1,1,1,2739,3412,6 -2846,add_123,call_function,add.Tensor,forward,24,1,1,1,2740,3411,4 -2847,div_24,call_function,div.Tensor,forward,24,2,2,1,2741,3410,6 -2848,convert_element_type_595,call_function,convert_element_type.default,forward,24,1,1,1,2742,3409,6 -2849,dtype_cast_224,call_function,dtype_cast.default,forward,24,1,1,1,1,3413,3 -2850,permute_273,call_function,permute.default,forward,24,1,1,1,2,3412,3 -2851,alias_default_700,call_function,alias.default,forward,24,1,1,2,3,3411,3 -2852,einsum_default_173,call_function,einsum.default,forward,24,2,2,1,2734,3409,5 -2853,alias_default_699,call_function,alias.default,forward,24,1,1,2,2743,3408,4 -2854,alias_default_701,call_function,alias.default,forward,24,1,1,2,2735,3408,4 -2855,mul_174,call_function,mul.Tensor,forward,24,2,2,1,2750,3407,8 -2856,dtype_cast_225,call_function,dtype_cast.default,forward,24,1,1,1,1,3409,3 -2857,permute_274,call_function,permute.default,forward,24,1,1,1,2,3408,3 -2858,alias_default_702,call_function,alias.default,forward,24,1,1,2,2751,3406,4 -2859,alias_default_703,call_function,alias.default,forward,24,1,1,2,3,3407,3 -2860,einsum_default_174,call_function,einsum.default,forward,24,2,2,1,2756,3405,5 -2861,add_124,call_function,add.Tensor,forward,24,2,2,1,2757,3404,10 -2862,dtype_cast_226,call_function,dtype_cast.default,forward,25,1,1,1,1,3393,2 
-2863,alias_default_704,call_function,alias.default,forward,24,1,1,3,2758,3403,4 -2864,convert_element_type_600,call_function,convert_element_type.default,forward,25,1,1,1,2759,3401,4 -2865,alias_default_706,call_function,alias.default,forward,25,1,1,2,2760,3400,4 -2866,pow_51,call_function,pow.Tensor_Scalar,forward,25,1,1,1,2761,3399,4 -2867,mean_50,call_function,mean.dim,forward,25,1,1,1,2762,3398,4 -2868,add_125,call_function,add.Scalar,forward,25,1,1,1,2763,3397,3 -2869,rsqrt_50,call_function,rsqrt.default,forward,25,1,1,1,2764,3396,3 -2870,alias_default_707,call_function,alias.default,forward,25,1,1,3,2765,3395,3 -2871,mul_175,call_function,mul.Tensor,forward,25,2,2,1,2766,3391,8 -2872,alias_default_705,call_function,alias.default,forward,25,1,1,2,2,3392,2 -2873,mul_176,call_function,mul.Tensor,forward,25,2,2,1,2770,3390,8 -2874,convert_element_type_601,call_function,convert_element_type.default,forward,25,1,1,1,2771,3389,6 -2875,dtype_cast_227,call_function,dtype_cast.default,forward,25,1,1,1,1,3376,3 -2876,permute_275,call_function,permute.default,forward,25,1,1,1,2,3375,3 -2877,alias_default_708,call_function,alias.default,forward,25,1,1,6,2772,3388,4 -2878,alias_default_709,call_function,alias.default,forward,25,1,1,2,3,3374,3 -2879,einsum_default_175,call_function,einsum.default,forward,25,2,2,1,2777,3372,5 -2880,dtype_cast_228,call_function,dtype_cast.default,forward,25,1,1,1,1,3376,3 -2881,permute_276,call_function,permute.default,forward,25,1,1,1,2,3375,3 -2882,alias_default_710,call_function,alias.default,forward,25,1,1,2,3,3374,3 -2883,einsum_default_176,call_function,einsum.default,forward,25,2,2,1,2777,3372,5 -2884,dtype_cast_229,call_function,dtype_cast.default,forward,25,1,1,1,1,3369,3 -2885,permute_277,call_function,permute.default,forward,25,1,1,1,2,3368,3 -2886,alias_default_711,call_function,alias.default,forward,25,1,1,2,3,3367,3 -2887,einsum_default_177,call_function,einsum.default,forward,25,2,2,1,2777,3365,5 
-2888,view_581,call_function,view.default,forward,25,1,1,1,2778,3371,4 -2889,view_582,call_function,view.default,forward,25,1,1,1,2778,3371,4 -2890,view_583,call_function,view.default,forward,25,1,1,1,2778,3364,4 -2891,convert_element_type_608,call_function,convert_element_type.default,forward,25,1,1,1,2779,3370,4 -2892,view_584,call_function,view.default,forward,25,1,1,1,2780,3369,4 -2893,view_as_complex_50,call_function,view_as_complex.default,forward,25,1,1,1,2781,3368,6 -2894,convert_element_type_609,call_function,convert_element_type.default,forward,25,1,1,1,2779,3370,4 -2895,view_585,call_function,view.default,forward,25,1,1,1,2780,3369,4 -2896,view_as_complex_51,call_function,view_as_complex.default,forward,25,1,1,1,2781,3368,6 -2897,view_586,call_function,view.default,forward,25,1,1,1,2,3379,3 -2898,alias_default_712,call_function,alias.default,forward,25,1,1,4,3,3378,3 -2899,mul_177,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8 -2900,view_as_real_50,call_function,view_as_real.default,forward,25,1,1,1,2785,3366,6 -2901,view_587,call_function,view.default,forward,25,1,1,1,2786,3365,6 -2902,mul_178,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8 -2903,view_as_real_51,call_function,view_as_real.default,forward,25,1,1,1,2785,3366,6 -2904,view_588,call_function,view.default,forward,25,1,1,1,2786,3365,6 -2905,convert_element_type_610,call_function,convert_element_type.default,forward,25,1,1,1,2787,3364,6 -2906,convert_element_type_611,call_function,convert_element_type.default,forward,25,1,1,1,2787,3364,6 -2907,permute_278,call_function,permute.default,forward,25,1,1,1,2788,3363,6 -2908,permute_279,call_function,permute.default,forward,25,1,1,1,2788,3363,6 -2909,permute_280,call_function,permute.default,forward,25,1,1,1,2779,3363,4 -2910,alias_default_713,call_function,alias.default,forward,25,1,1,2,2789,3362,4 -2911,alias_default_714,call_function,alias.default,forward,25,1,1,2,2789,3362,4 
-2912,alias_default_715,call_function,alias.default,forward,25,1,1,2,2780,3362,4 -2913,_scaled_dot_product_flash_attention_25,call_function,_scaled_dot_product_flash_attention.default,forward,25,3,3,4,2813,3361,2 -2914,getitem_225,call_function,getitem,forward,25,1,1,1,2814,3357,2 -2915,getitem_226,call_function,getitem,forward,25,1,1,1,2814,2814,2 -2916,getitem_231,call_function,getitem,forward,25,1,1,1,2814,2814,1 -2917,getitem_232,call_function,getitem,forward,25,1,1,1,2814,2814,1 -2918,alias_default_716,call_function,alias.default,forward,25,1,1,2,2815,3356,4 -2919,permute_281,call_function,permute.default,forward,25,1,1,1,2816,3355,4 -2920,view_589,call_function,view.default,forward,25,1,1,1,2817,3354,3 -2921,dtype_cast_230,call_function,dtype_cast.default,forward,25,1,1,1,1,3356,3 -2922,permute_282,call_function,permute.default,forward,25,1,1,1,2,3355,3 -2923,alias_default_717,call_function,alias.default,forward,25,1,1,2,2818,3353,4 -2924,alias_default_718,call_function,alias.default,forward,25,1,1,2,3,3354,3 -2925,einsum_default_178,call_function,einsum.default,forward,25,2,2,1,2823,3352,5 -2926,add_126,call_function,add.Tensor,forward,25,2,2,1,2824,3351,10 -2927,dtype_cast_231,call_function,dtype_cast.default,forward,25,1,1,1,1,3340,2 -2928,alias_default_719,call_function,alias.default,forward,25,1,1,3,2825,3350,4 -2929,convert_element_type_614,call_function,convert_element_type.default,forward,25,1,1,1,2826,3348,4 -2930,alias_default_721,call_function,alias.default,forward,25,1,1,2,2827,3347,4 -2931,pow_52,call_function,pow.Tensor_Scalar,forward,25,1,1,1,2828,3346,4 -2932,mean_51,call_function,mean.dim,forward,25,1,1,1,2829,3345,4 -2933,add_127,call_function,add.Scalar,forward,25,1,1,1,2830,3344,3 -2934,rsqrt_51,call_function,rsqrt.default,forward,25,1,1,1,2831,3343,3 -2935,alias_default_722,call_function,alias.default,forward,25,1,1,3,2832,3342,3 -2936,mul_179,call_function,mul.Tensor,forward,25,2,2,1,2833,3338,8 
-2937,alias_default_720,call_function,alias.default,forward,25,1,1,2,2,3339,2 -2938,mul_180,call_function,mul.Tensor,forward,25,2,2,1,2837,3337,8 -2939,convert_element_type_615,call_function,convert_element_type.default,forward,25,1,1,1,2838,3336,6 -2940,dtype_cast_232,call_function,dtype_cast.default,forward,25,1,1,1,1,3336,3 -2941,permute_283,call_function,permute.default,forward,25,1,1,1,2,3335,3 -2942,alias_default_723,call_function,alias.default,forward,25,1,1,4,2839,3335,4 -2943,alias_default_724,call_function,alias.default,forward,25,1,1,2,3,3334,3 -2944,einsum_default_179,call_function,einsum.default,forward,25,2,2,1,2844,3332,5 -2945,alias_default_725,call_function,alias.default,forward,25,1,1,2,2845,3331,4 -2946,convert_element_type_618,call_function,convert_element_type.default,forward,25,1,1,1,2846,3319,4 -2947,alias_default_726,call_function,alias.default,forward,25,1,1,2,2847,3318,4 -2948,neg_25,call_function,neg.default,forward,25,1,1,1,2848,3317,8 -2949,exp_25,call_function,exp.default,forward,25,1,1,1,2849,3316,6 -2950,add_128,call_function,add.Tensor,forward,25,1,1,1,2850,3315,4 -2951,div_25,call_function,div.Tensor,forward,25,2,2,1,2851,3314,6 -2952,convert_element_type_619,call_function,convert_element_type.default,forward,25,1,1,1,2852,3313,6 -2953,dtype_cast_233,call_function,dtype_cast.default,forward,25,1,1,1,1,3317,3 -2954,permute_284,call_function,permute.default,forward,25,1,1,1,2,3316,3 -2955,alias_default_728,call_function,alias.default,forward,25,1,1,2,3,3315,3 -2956,einsum_default_180,call_function,einsum.default,forward,25,2,2,1,2844,3313,5 -2957,alias_default_727,call_function,alias.default,forward,25,1,1,2,2853,3312,4 -2958,alias_default_729,call_function,alias.default,forward,25,1,1,2,2845,3312,4 -2959,mul_181,call_function,mul.Tensor,forward,25,2,2,1,2860,3311,8 -2960,dtype_cast_234,call_function,dtype_cast.default,forward,25,1,1,1,1,3313,3 -2961,permute_285,call_function,permute.default,forward,25,1,1,1,2,3312,3 
-2962,alias_default_730,call_function,alias.default,forward,25,1,1,2,2861,3310,4 -2963,alias_default_731,call_function,alias.default,forward,25,1,1,2,3,3311,3 -2964,einsum_default_181,call_function,einsum.default,forward,25,2,2,1,2866,3309,5 -2965,add_129,call_function,add.Tensor,forward,25,2,2,1,2867,3308,10 -2966,dtype_cast_235,call_function,dtype_cast.default,forward,26,1,1,1,1,3297,2 -2967,alias_default_732,call_function,alias.default,forward,25,1,1,3,2868,3307,4 -2968,convert_element_type_624,call_function,convert_element_type.default,forward,26,1,1,1,2869,3305,4 -2969,alias_default_734,call_function,alias.default,forward,26,1,1,2,2870,3304,4 -2970,pow_53,call_function,pow.Tensor_Scalar,forward,26,1,1,1,2871,3303,4 -2971,mean_52,call_function,mean.dim,forward,26,1,1,1,2872,3302,4 -2972,add_130,call_function,add.Scalar,forward,26,1,1,1,2873,3301,3 -2973,rsqrt_52,call_function,rsqrt.default,forward,26,1,1,1,2874,3300,3 -2974,alias_default_735,call_function,alias.default,forward,26,1,1,3,2875,3299,3 -2975,mul_182,call_function,mul.Tensor,forward,26,2,2,1,2876,3295,8 -2976,alias_default_733,call_function,alias.default,forward,26,1,1,2,2,3296,2 -2977,mul_183,call_function,mul.Tensor,forward,26,2,2,1,2880,3294,8 -2978,convert_element_type_625,call_function,convert_element_type.default,forward,26,1,1,1,2881,3293,6 -2979,dtype_cast_236,call_function,dtype_cast.default,forward,26,1,1,1,1,3280,3 -2980,permute_286,call_function,permute.default,forward,26,1,1,1,2,3279,3 -2981,alias_default_736,call_function,alias.default,forward,26,1,1,6,2882,3292,4 -2982,alias_default_737,call_function,alias.default,forward,26,1,1,2,3,3278,3 -2983,einsum_default_182,call_function,einsum.default,forward,26,2,2,1,2887,3276,5 -2984,dtype_cast_237,call_function,dtype_cast.default,forward,26,1,1,1,1,3280,3 -2985,permute_287,call_function,permute.default,forward,26,1,1,1,2,3279,3 -2986,alias_default_738,call_function,alias.default,forward,26,1,1,2,3,3278,3 
-2987,einsum_default_183,call_function,einsum.default,forward,26,2,2,1,2887,3276,5 -2988,dtype_cast_238,call_function,dtype_cast.default,forward,26,1,1,1,1,3273,3 -2989,permute_288,call_function,permute.default,forward,26,1,1,1,2,3272,3 -2990,alias_default_739,call_function,alias.default,forward,26,1,1,2,3,3271,3 -2991,einsum_default_184,call_function,einsum.default,forward,26,2,2,1,2887,3269,5 -2992,view_604,call_function,view.default,forward,26,1,1,1,2888,3275,4 -2993,view_605,call_function,view.default,forward,26,1,1,1,2888,3275,4 -2994,view_606,call_function,view.default,forward,26,1,1,1,2888,3268,4 -2995,convert_element_type_632,call_function,convert_element_type.default,forward,26,1,1,1,2889,3274,4 -2996,view_607,call_function,view.default,forward,26,1,1,1,2890,3273,4 -2997,view_as_complex_52,call_function,view_as_complex.default,forward,26,1,1,1,2891,3272,6 -2998,convert_element_type_633,call_function,convert_element_type.default,forward,26,1,1,1,2889,3274,4 -2999,view_608,call_function,view.default,forward,26,1,1,1,2890,3273,4 -3000,view_as_complex_53,call_function,view_as_complex.default,forward,26,1,1,1,2891,3272,6 -3001,view_609,call_function,view.default,forward,26,1,1,1,2,3283,3 -3002,alias_default_740,call_function,alias.default,forward,26,1,1,4,3,3282,3 -3003,mul_184,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8 -3004,view_as_real_52,call_function,view_as_real.default,forward,26,1,1,1,2895,3270,6 -3005,view_610,call_function,view.default,forward,26,1,1,1,2896,3269,6 -3006,mul_185,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8 -3007,view_as_real_53,call_function,view_as_real.default,forward,26,1,1,1,2895,3270,6 -3008,view_611,call_function,view.default,forward,26,1,1,1,2896,3269,6 -3009,convert_element_type_634,call_function,convert_element_type.default,forward,26,1,1,1,2897,3268,6 -3010,convert_element_type_635,call_function,convert_element_type.default,forward,26,1,1,1,2897,3268,6 
-3011,permute_289,call_function,permute.default,forward,26,1,1,1,2898,3267,6 -3012,permute_290,call_function,permute.default,forward,26,1,1,1,2898,3267,6 -3013,permute_291,call_function,permute.default,forward,26,1,1,1,2889,3267,4 -3014,alias_default_741,call_function,alias.default,forward,26,1,1,2,2899,3266,4 -3015,alias_default_742,call_function,alias.default,forward,26,1,1,2,2899,3266,4 -3016,alias_default_743,call_function,alias.default,forward,26,1,1,2,2890,3266,4 -3017,_scaled_dot_product_flash_attention_26,call_function,_scaled_dot_product_flash_attention.default,forward,26,3,3,4,2923,3265,2 -3018,getitem_234,call_function,getitem,forward,26,1,1,1,2924,3261,2 -3019,getitem_235,call_function,getitem,forward,26,1,1,1,2924,2924,2 -3020,getitem_240,call_function,getitem,forward,26,1,1,1,2924,2924,1 -3021,getitem_241,call_function,getitem,forward,26,1,1,1,2924,2924,1 -3022,alias_default_744,call_function,alias.default,forward,26,1,1,2,2925,3260,4 -3023,permute_292,call_function,permute.default,forward,26,1,1,1,2926,3259,4 -3024,view_612,call_function,view.default,forward,26,1,1,1,2927,3258,3 -3025,dtype_cast_239,call_function,dtype_cast.default,forward,26,1,1,1,1,3260,3 -3026,permute_293,call_function,permute.default,forward,26,1,1,1,2,3259,3 -3027,alias_default_745,call_function,alias.default,forward,26,1,1,2,2928,3257,4 -3028,alias_default_746,call_function,alias.default,forward,26,1,1,2,3,3258,3 -3029,einsum_default_185,call_function,einsum.default,forward,26,2,2,1,2933,3256,5 -3030,add_131,call_function,add.Tensor,forward,26,2,2,1,2934,3255,10 -3031,dtype_cast_240,call_function,dtype_cast.default,forward,26,1,1,1,1,3244,2 -3032,alias_default_747,call_function,alias.default,forward,26,1,1,3,2935,3254,4 -3033,convert_element_type_638,call_function,convert_element_type.default,forward,26,1,1,1,2936,3252,4 -3034,alias_default_749,call_function,alias.default,forward,26,1,1,2,2937,3251,4 -3035,pow_54,call_function,pow.Tensor_Scalar,forward,26,1,1,1,2938,3250,4 
-3036,mean_53,call_function,mean.dim,forward,26,1,1,1,2939,3249,4 -3037,add_132,call_function,add.Scalar,forward,26,1,1,1,2940,3248,3 -3038,rsqrt_53,call_function,rsqrt.default,forward,26,1,1,1,2941,3247,3 -3039,alias_default_750,call_function,alias.default,forward,26,1,1,3,2942,3246,3 -3040,mul_186,call_function,mul.Tensor,forward,26,2,2,1,2943,3242,8 -3041,alias_default_748,call_function,alias.default,forward,26,1,1,2,2,3243,2 -3042,mul_187,call_function,mul.Tensor,forward,26,2,2,1,2947,3241,8 -3043,convert_element_type_639,call_function,convert_element_type.default,forward,26,1,1,1,2948,3240,6 -3044,dtype_cast_241,call_function,dtype_cast.default,forward,26,1,1,1,1,3240,3 -3045,permute_294,call_function,permute.default,forward,26,1,1,1,2,3239,3 -3046,alias_default_751,call_function,alias.default,forward,26,1,1,4,2949,3239,4 -3047,alias_default_752,call_function,alias.default,forward,26,1,1,2,3,3238,3 -3048,einsum_default_186,call_function,einsum.default,forward,26,2,2,1,2954,3236,5 -3049,alias_default_753,call_function,alias.default,forward,26,1,1,2,2955,3235,4 -3050,convert_element_type_642,call_function,convert_element_type.default,forward,26,1,1,1,2956,3223,4 -3051,alias_default_754,call_function,alias.default,forward,26,1,1,2,2957,3222,4 -3052,neg_26,call_function,neg.default,forward,26,1,1,1,2958,3221,8 -3053,exp_26,call_function,exp.default,forward,26,1,1,1,2959,3220,6 -3054,add_133,call_function,add.Tensor,forward,26,1,1,1,2960,3219,4 -3055,div_26,call_function,div.Tensor,forward,26,2,2,1,2961,3218,6 -3056,convert_element_type_643,call_function,convert_element_type.default,forward,26,1,1,1,2962,3217,6 -3057,dtype_cast_242,call_function,dtype_cast.default,forward,26,1,1,1,1,3221,3 -3058,permute_295,call_function,permute.default,forward,26,1,1,1,2,3220,3 -3059,alias_default_756,call_function,alias.default,forward,26,1,1,2,3,3219,3 -3060,einsum_default_187,call_function,einsum.default,forward,26,2,2,1,2954,3217,5 
-3061,alias_default_755,call_function,alias.default,forward,26,1,1,2,2963,3216,4 -3062,alias_default_757,call_function,alias.default,forward,26,1,1,2,2955,3216,4 -3063,mul_188,call_function,mul.Tensor,forward,26,2,2,1,2970,3215,8 -3064,dtype_cast_243,call_function,dtype_cast.default,forward,26,1,1,1,1,3217,3 -3065,permute_296,call_function,permute.default,forward,26,1,1,1,2,3216,3 -3066,alias_default_758,call_function,alias.default,forward,26,1,1,2,2971,3214,4 -3067,alias_default_759,call_function,alias.default,forward,26,1,1,2,3,3215,3 -3068,einsum_default_188,call_function,einsum.default,forward,26,2,2,1,2976,3213,5 -3069,add_134,call_function,add.Tensor,forward,26,2,2,1,2977,3212,10 -3070,dtype_cast_244,call_function,dtype_cast.default,forward,27,1,1,1,1,3201,2 -3071,alias_default_760,call_function,alias.default,forward,26,1,1,3,2978,3211,4 -3072,convert_element_type_648,call_function,convert_element_type.default,forward,27,1,1,1,2979,3209,4 -3073,alias_default_762,call_function,alias.default,forward,27,1,1,2,2980,3208,4 -3074,pow_55,call_function,pow.Tensor_Scalar,forward,27,1,1,1,2981,3207,4 -3075,mean_54,call_function,mean.dim,forward,27,1,1,1,2982,3206,4 -3076,add_135,call_function,add.Scalar,forward,27,1,1,1,2983,3205,3 -3077,rsqrt_54,call_function,rsqrt.default,forward,27,1,1,1,2984,3204,3 -3078,alias_default_763,call_function,alias.default,forward,27,1,1,3,2985,3203,3 -3079,mul_189,call_function,mul.Tensor,forward,27,2,2,1,2986,3199,8 -3080,alias_default_761,call_function,alias.default,forward,27,1,1,2,2,3200,2 -3081,mul_190,call_function,mul.Tensor,forward,27,2,2,1,2990,3198,8 -3082,convert_element_type_649,call_function,convert_element_type.default,forward,27,1,1,1,2991,3197,6 -3083,dtype_cast_245,call_function,dtype_cast.default,forward,27,1,1,1,1,3184,3 -3084,permute_297,call_function,permute.default,forward,27,1,1,1,2,3183,3 -3085,alias_default_764,call_function,alias.default,forward,27,1,1,6,2992,3196,4 
-3086,alias_default_765,call_function,alias.default,forward,27,1,1,2,3,3182,3 -3087,einsum_default_189,call_function,einsum.default,forward,27,2,2,1,2997,3180,5 -3088,dtype_cast_246,call_function,dtype_cast.default,forward,27,1,1,1,1,3184,3 -3089,permute_298,call_function,permute.default,forward,27,1,1,1,2,3183,3 -3090,alias_default_766,call_function,alias.default,forward,27,1,1,2,3,3182,3 -3091,einsum_default_190,call_function,einsum.default,forward,27,2,2,1,2997,3180,5 -3092,dtype_cast_247,call_function,dtype_cast.default,forward,27,1,1,1,1,3177,3 -3093,permute_299,call_function,permute.default,forward,27,1,1,1,2,3176,3 -3094,alias_default_767,call_function,alias.default,forward,27,1,1,2,3,3175,3 -3095,einsum_default_191,call_function,einsum.default,forward,27,2,2,1,2997,3173,5 -3096,view_627,call_function,view.default,forward,27,1,1,1,2998,3179,4 -3097,view_628,call_function,view.default,forward,27,1,1,1,2998,3179,4 -3098,view_629,call_function,view.default,forward,27,1,1,1,2998,3172,4 -3099,convert_element_type_656,call_function,convert_element_type.default,forward,27,1,1,1,2999,3178,4 -3100,view_630,call_function,view.default,forward,27,1,1,1,3000,3177,4 -3101,view_as_complex_54,call_function,view_as_complex.default,forward,27,1,1,1,3001,3176,6 -3102,convert_element_type_657,call_function,convert_element_type.default,forward,27,1,1,1,2999,3178,4 -3103,view_631,call_function,view.default,forward,27,1,1,1,3000,3177,4 -3104,view_as_complex_55,call_function,view_as_complex.default,forward,27,1,1,1,3001,3176,6 -3105,view_632,call_function,view.default,forward,27,1,1,1,2,3187,3 -3106,alias_default_768,call_function,alias.default,forward,27,1,1,4,3,3186,3 -3107,mul_191,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8 -3108,view_as_real_54,call_function,view_as_real.default,forward,27,1,1,1,3005,3174,6 -3109,view_633,call_function,view.default,forward,27,1,1,1,3006,3173,6 -3110,mul_192,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8 
-3111,view_as_real_55,call_function,view_as_real.default,forward,27,1,1,1,3005,3174,6 -3112,view_634,call_function,view.default,forward,27,1,1,1,3006,3173,6 -3113,convert_element_type_658,call_function,convert_element_type.default,forward,27,1,1,1,3007,3172,6 -3114,convert_element_type_659,call_function,convert_element_type.default,forward,27,1,1,1,3007,3172,6 -3115,permute_300,call_function,permute.default,forward,27,1,1,1,3008,3171,6 -3116,permute_301,call_function,permute.default,forward,27,1,1,1,3008,3171,6 -3117,permute_302,call_function,permute.default,forward,27,1,1,1,2999,3171,4 -3118,alias_default_769,call_function,alias.default,forward,27,1,1,2,3009,3170,4 -3119,alias_default_770,call_function,alias.default,forward,27,1,1,2,3009,3170,4 -3120,alias_default_771,call_function,alias.default,forward,27,1,1,2,3000,3170,4 -3121,_scaled_dot_product_flash_attention_27,call_function,_scaled_dot_product_flash_attention.default,forward,27,3,3,4,3033,3169,2 -3122,getitem_243,call_function,getitem,forward,27,1,1,1,3034,3165,2 -3123,getitem_244,call_function,getitem,forward,27,1,1,1,3034,3034,2 -3124,getitem_249,call_function,getitem,forward,27,1,1,1,3034,3034,1 -3125,getitem_250,call_function,getitem,forward,27,1,1,1,3034,3034,1 -3126,alias_default_772,call_function,alias.default,forward,27,1,1,2,3035,3164,4 -3127,permute_303,call_function,permute.default,forward,27,1,1,1,3036,3163,4 -3128,view_635,call_function,view.default,forward,27,1,1,1,3037,3162,3 -3129,dtype_cast_248,call_function,dtype_cast.default,forward,27,1,1,1,1,3164,3 -3130,permute_304,call_function,permute.default,forward,27,1,1,1,2,3163,3 -3131,alias_default_773,call_function,alias.default,forward,27,1,1,2,3038,3161,4 -3132,alias_default_774,call_function,alias.default,forward,27,1,1,2,3,3162,3 -3133,einsum_default_192,call_function,einsum.default,forward,27,2,2,1,3043,3160,5 -3134,add_136,call_function,add.Tensor,forward,27,2,2,1,3044,3159,10 
-3135,dtype_cast_249,call_function,dtype_cast.default,forward,27,1,1,1,1,3148,2 -3136,alias_default_775,call_function,alias.default,forward,27,1,1,3,3045,3158,4 -3137,convert_element_type_662,call_function,convert_element_type.default,forward,27,1,1,1,3046,3156,4 -3138,alias_default_777,call_function,alias.default,forward,27,1,1,2,3047,3155,4 -3139,pow_56,call_function,pow.Tensor_Scalar,forward,27,1,1,1,3048,3154,4 -3140,mean_55,call_function,mean.dim,forward,27,1,1,1,3049,3153,4 -3141,add_137,call_function,add.Scalar,forward,27,1,1,1,3050,3152,3 -3142,rsqrt_55,call_function,rsqrt.default,forward,27,1,1,1,3051,3151,3 -3143,alias_default_778,call_function,alias.default,forward,27,1,1,3,3052,3150,3 -3144,mul_193,call_function,mul.Tensor,forward,27,2,2,1,3053,3146,8 -3145,alias_default_776,call_function,alias.default,forward,27,1,1,2,2,3147,2 -3146,mul_194,call_function,mul.Tensor,forward,27,2,2,1,3057,3145,8 -3147,convert_element_type_663,call_function,convert_element_type.default,forward,27,1,1,1,3058,3144,6 -3148,dtype_cast_250,call_function,dtype_cast.default,forward,27,1,1,1,1,3144,3 -3149,permute_305,call_function,permute.default,forward,27,1,1,1,2,3143,3 -3150,alias_default_779,call_function,alias.default,forward,27,1,1,4,3059,3143,4 -3151,alias_default_780,call_function,alias.default,forward,27,1,1,2,3,3142,3 -3152,einsum_default_193,call_function,einsum.default,forward,27,2,2,1,3064,3140,5 -3153,alias_default_781,call_function,alias.default,forward,27,1,1,2,3065,3139,4 -3154,convert_element_type_666,call_function,convert_element_type.default,forward,27,1,1,1,3066,3127,4 -3155,alias_default_782,call_function,alias.default,forward,27,1,1,2,3067,3126,4 -3156,neg_27,call_function,neg.default,forward,27,1,1,1,3068,3125,8 -3157,exp_27,call_function,exp.default,forward,27,1,1,1,3069,3124,6 -3158,add_138,call_function,add.Tensor,forward,27,1,1,1,3070,3123,4 -3159,div_27,call_function,div.Tensor,forward,27,2,2,1,3071,3122,6 
-3160,convert_element_type_667,call_function,convert_element_type.default,forward,27,1,1,1,3072,3121,6 -3161,dtype_cast_251,call_function,dtype_cast.default,forward,27,1,1,1,1,3125,3 -3162,permute_306,call_function,permute.default,forward,27,1,1,1,2,3124,3 -3163,alias_default_784,call_function,alias.default,forward,27,1,1,2,3,3123,3 -3164,einsum_default_194,call_function,einsum.default,forward,27,2,2,1,3064,3121,5 -3165,alias_default_783,call_function,alias.default,forward,27,1,1,2,3073,3120,4 -3166,alias_default_785,call_function,alias.default,forward,27,1,1,2,3065,3120,4 -3167,mul_195,call_function,mul.Tensor,forward,27,2,2,1,3080,3119,8 -3168,dtype_cast_252,call_function,dtype_cast.default,forward,27,1,1,1,1,3121,3 -3169,permute_307,call_function,permute.default,forward,27,1,1,1,2,3120,3 -3170,alias_default_786,call_function,alias.default,forward,27,1,1,2,3081,3118,4 -3171,alias_default_787,call_function,alias.default,forward,27,1,1,2,3,3119,3 -3172,einsum_default_195,call_function,einsum.default,forward,27,2,2,1,3086,3117,5 -3173,add_139,call_function,add.Tensor,forward,27,2,2,1,3087,3116,10 -3174,dtype_cast_253,call_function,dtype_cast.default,forward,,1,1,1,1,3102,2 -3175,alias_default_788,call_function,alias.default,forward,27,1,1,2,3088,3115,4 -3176,convert_element_type_672,call_function,convert_element_type.default,forward,,1,1,1,3089,3113,4 -3177,alias_default_790,call_function,alias.default,forward,,1,1,2,3090,3112,4 -3178,pow_57,call_function,pow.Tensor_Scalar,forward,,1,1,1,3091,3111,4 -3179,mean_56,call_function,mean.dim,forward,,1,1,1,3092,3110,4 -3180,add_140,call_function,add.Scalar,forward,,1,1,1,3093,3109,3 -3181,rsqrt_56,call_function,rsqrt.default,forward,,1,1,1,3094,3108,3 -3182,alias_default_791,call_function,alias.default,forward,,1,1,3,3095,3107,3 -3183,mul_196,call_function,mul.Tensor,forward,,2,2,1,3096,10,8 -3184,alias_default_789,call_function,alias.default,forward,,1,1,2,2,3101,2 
-3185,mul_197,call_function,mul.Tensor,forward,,2,2,1,3100,9,8 -3186,convert_element_type_673,call_function,convert_element_type.default,forward,,1,1,1,3101,8,6 -3187,dtype_cast_254,call_function,dtype_cast.default,forward,,1,1,1,2,3105,3 -3188,permute_308,call_function,permute.default,forward,,1,1,1,3,3104,3 -3189,alias_default_792,call_function,alias.default,forward,,1,1,2,3102,7,4 -3190,alias_default_793,call_function,alias.default,forward,,1,1,2,4,3103,3 -3191,einsum_default_196,call_function,einsum.default,forward,,2,2,1,3106,1,5 -3192,alias_default_1245,call_function,alias.default,forward,,1,1,0,3107,0,4 -3193,alias_default_3,call_function,alias.default,unknown,,1,1,2,1,3103,4 -3194,einsum_default_197,call_function,einsum.default,backward,,2,2,1,3105,4,5 -3195,permute_311,call_function,permute.default,backward,,1,1,1,5,3100,3 -3196,einsum_default_198,call_function,einsum.default,backward,,2,2,1,8,3099,5 -3197,permute_312,call_function,permute.default,backward,,1,1,1,3106,3,4 -3198,dtype_cast_255,call_function,dtype_cast.default,backward,,1,1,1,3107,2,4 -3199,convert_element_type_680,call_function,convert_element_type.default,backward,,1,1,1,9,3098,5 -3200,convert_element_type_681,call_function,convert_element_type.default,backward,,1,1,1,3089,3098,4 -3201,convert_element_type_682,call_function,convert_element_type.default,backward,,1,1,1,3,3092,2 -3202,alias_default_794,call_function,alias.default,backward,,1,1,2,10,3097,4 -3203,mul_198,call_function,mul.Tensor,backward,,2,2,1,15,3091,8 -3204,mul_199,call_function,mul.Tensor,backward,,2,2,1,3097,3097,8 -3205,alias_default_795,call_function,alias.default,backward,,1,1,2,16,3090,4 -3206,alias_default_796,call_function,alias.default,backward,,1,1,3,3098,3096,4 -3207,mul_200,call_function,mul.Tensor,backward,,2,2,1,3114,3089,8 -3208,sum_1,call_function,sum.dim_IntList,backward,,1,1,1,3115,3088,5 -3209,div_28,call_function,div.Tensor,backward,,1,1,1,3099,3088,6 
-3210,mul_201,call_function,mul.Tensor,backward,,2,2,1,3117,3087,8 -3211,sub,call_function,sub.Tensor,backward,,2,2,1,3118,3086,10 -3212,mul_202,call_function,mul.Tensor,backward,,2,2,1,3119,3085,8 -3213,mul_203,call_function,mul.Tensor,backward,,2,2,1,3108,4,8 -3214,sum_2,call_function,sum.dim_IntList,backward,,1,1,1,3109,3,5 -3215,convert_element_type_683,call_function,convert_element_type.default,backward,,1,1,1,3120,3084,6 -3216,convert_element_type_684,call_function,convert_element_type.default,backward,,1,1,1,3110,2,3 -3217,dtype_cast_256,call_function,dtype_cast.default,backward,,1,1,1,3111,1,3 -3218,alias_default_1499,call_function,alias.default,backward,,1,1,0,3112,0,2 -3219,alias_default_797,call_function,alias.default,backward,,1,1,3,3121,3083,4 -3220,einsum_default_199,call_function,einsum.default,backward,27,2,2,1,3122,3,5 -3221,permute_315,call_function,permute.default,backward,27,1,1,1,4,3079,3 -3222,einsum_default_200,call_function,einsum.default,backward,27,2,2,1,3123,3078,5 -3223,permute_316,call_function,permute.default,backward,27,1,1,1,3123,2,4 -3224,dtype_cast_257,call_function,dtype_cast.default,backward,27,1,1,1,3124,1,4 -3225,alias_default_1495,call_function,alias.default,backward,27,1,1,0,3125,0,3 -3226,alias_default_798,call_function,alias.default,backward,27,1,1,2,3124,3077,4 -3227,mul_204,call_function,mul.Tensor,backward,27,2,2,1,3125,3065,8 -3228,mul_205,call_function,mul.Tensor,backward,27,2,2,1,3125,3069,8 -3229,alias_default_799,call_function,alias.default,backward,27,1,1,2,3126,3064,4 -3230,einsum_default_201,call_function,einsum.default,backward,27,2,2,1,3127,3,5 -3231,permute_319,call_function,permute.default,backward,27,1,1,1,4,3060,3 -3232,einsum_default_202,call_function,einsum.default,backward,27,2,2,1,3128,3059,5 -3233,permute_320,call_function,permute.default,backward,27,1,1,1,3128,2,4 -3234,dtype_cast_258,call_function,dtype_cast.default,backward,27,1,1,1,3129,1,4 
-3235,alias_default_1496,call_function,alias.default,backward,27,1,1,0,3130,0,3 -3236,convert_element_type_693,call_function,convert_element_type.default,backward,27,1,1,1,3126,3068,6 -3237,convert_element_type_694,call_function,convert_element_type.default,backward,27,1,1,1,3066,3078,4 -3238,alias_default_800,call_function,alias.default,backward,27,1,1,2,3067,3077,4 -3239,neg_28,call_function,neg.default,backward,27,1,1,1,3068,3076,8 -3240,exp_28,call_function,exp.default,backward,27,1,1,1,3069,3075,6 -3241,add_141,call_function,add.Tensor,backward,27,1,1,1,3070,3074,4 -3242,reciprocal,call_function,reciprocal.default,backward,27,1,1,1,3071,3073,4 -3243,mul_206,call_function,mul.Tensor,backward,27,1,1,1,3072,3072,6 -3244,alias_default_801,call_function,alias.default,backward,27,1,1,2,3073,3071,4 -3245,mul_207,call_function,mul.Tensor,backward,27,2,2,1,3135,3067,8 -3246,sub_1,call_function,sub.Tensor,backward,27,1,1,1,3074,3069,4 -3247,mul_208,call_function,mul.Tensor,backward,27,2,2,1,3075,3068,8 -3248,add_142,call_function,add.Tensor,backward,27,1,1,1,3076,3067,4 -3249,mul_209,call_function,mul.Tensor,backward,27,2,2,1,3139,3066,8 -3250,convert_element_type_695,call_function,convert_element_type.default,backward,27,1,1,1,3140,3065,6 -3251,alias_default_802,call_function,alias.default,backward,27,1,1,2,3141,3064,4 -3252,einsum_default_203,call_function,einsum.default,backward,27,2,2,1,3142,3,5 -3253,permute_323,call_function,permute.default,backward,27,1,1,1,4,3060,3 -3254,einsum_default_204,call_function,einsum.default,backward,27,2,2,1,3143,3059,5 -3255,add_143,call_function,add.Tensor,unknown,,2,2,1,3148,3058,10 -3256,permute_324,call_function,permute.default,backward,27,1,1,1,3143,2,4 -3257,dtype_cast_259,call_function,dtype_cast.default,backward,27,1,1,1,3144,1,4 -3258,alias_default_1494,call_function,alias.default,backward,27,1,1,0,3145,0,3 -3259,convert_element_type_700,call_function,convert_element_type.default,backward,27,1,1,1,3149,3057,8 
-3260,convert_element_type_701,call_function,convert_element_type.default,backward,27,1,1,1,3046,3057,4 -3261,convert_element_type_702,call_function,convert_element_type.default,backward,27,1,1,1,3,3051,2 -3262,alias_default_803,call_function,alias.default,backward,27,1,1,2,3150,3056,4 -3263,mul_210,call_function,mul.Tensor,backward,27,2,2,1,3152,3050,8 -3264,mul_211,call_function,mul.Tensor,backward,27,2,2,1,3054,3056,8 -3265,alias_default_804,call_function,alias.default,backward,27,1,1,2,3153,3049,4 -3266,alias_default_805,call_function,alias.default,backward,27,1,1,3,3055,3055,4 -3267,mul_212,call_function,mul.Tensor,backward,27,2,2,1,3157,3048,8 -3268,sum_3,call_function,sum.dim_IntList,backward,27,1,1,1,3158,3047,5 -3269,div_29,call_function,div.Tensor,backward,27,1,1,1,3056,3047,6 -3270,mul_213,call_function,mul.Tensor,backward,27,2,2,1,3160,3046,8 -3271,sub_2,call_function,sub.Tensor,backward,27,2,2,1,3161,3045,10 -3272,mul_214,call_function,mul.Tensor,backward,27,2,2,1,3162,3044,8 -3273,mul_215,call_function,mul.Tensor,backward,27,2,2,1,3154,4,8 -3274,sum_4,call_function,sum.dim_IntList,backward,27,1,1,1,3155,3,5 -3275,convert_element_type_703,call_function,convert_element_type.default,backward,27,1,1,1,3163,3043,6 -3276,convert_element_type_704,call_function,convert_element_type.default,backward,27,1,1,1,3156,2,3 -3277,add_144,call_function,add.Tensor,unknown,,2,2,1,3164,3042,10 -3278,dtype_cast_260,call_function,dtype_cast.default,backward,27,1,1,1,3157,1,3 -3279,alias_default_1498,call_function,alias.default,backward,27,1,1,0,3158,0,2 -3280,alias_default_806,call_function,alias.default,unknown,,1,1,3,3165,3041,4 -3281,einsum_default_205,call_function,einsum.default,backward,27,2,2,1,3166,3,5 -3282,permute_327,call_function,permute.default,backward,27,1,1,1,4,3037,3 -3283,einsum_default_206,call_function,einsum.default,backward,27,2,2,1,3167,3036,5 -3284,permute_328,call_function,permute.default,backward,27,1,1,1,3167,2,4 
-3285,dtype_cast_261,call_function,dtype_cast.default,backward,27,1,1,1,3168,1,4 -3286,alias_default_1493,call_function,alias.default,backward,27,1,1,0,3169,0,3 -3287,view_656,call_function,view.default,backward,27,1,1,1,3168,3035,4 -3288,permute_329,call_function,permute.default,backward,27,1,1,1,3169,3034,4 -3289,_scaled_dot_product_flash_attention_backward,call_function,_scaled_dot_product_flash_attention_backward.default,backward,27,8,8,3,3173,3033,2 -3290,getitem_252,call_function,getitem,backward,27,1,1,1,3174,3006,2 -3291,getitem_253,call_function,getitem,backward,27,1,1,1,3174,3007,2 -3292,getitem_254,call_function,getitem,backward,27,1,1,1,3174,3000,2 -3293,permute_330,call_function,permute.default,backward,27,1,1,1,3175,2999,2 -3294,permute_331,call_function,permute.default,backward,27,1,1,1,3175,3006,2 -3295,permute_332,call_function,permute.default,backward,27,1,1,1,3175,3005,2 -3296,convert_element_type_709,call_function,convert_element_type.default,backward,27,1,1,1,3176,3005,2 -3297,convert_element_type_710,call_function,convert_element_type.default,backward,27,1,1,1,3176,3004,2 -3298,view_657,call_function,view.default,backward,27,1,1,1,3177,3004,2 -3299,view_as_complex_56,call_function,view_as_complex.default,backward,27,1,1,1,3178,3003,6 -3300,_conj,call_function,_conj.default,backward,27,1,1,1,4,3004,3 -3301,clone_6,call_function,clone.default,backward,27,1,1,1,5,3003,3 -3302,mul_216,call_function,mul.Tensor,backward,27,2,2,1,3181,3002,8 -3303,view_658,call_function,view.default,backward,27,1,1,1,3177,3003,2 -3304,view_as_complex_57,call_function,view_as_complex.default,backward,27,1,1,1,3178,3002,6 -3305,_conj_1,call_function,_conj.default,backward,27,1,1,1,4,3003,3 -3306,clone_7,call_function,clone.default,backward,27,1,1,1,5,3002,3 -3307,mul_217,call_function,mul.Tensor,backward,27,2,2,1,3181,3001,8 -3308,view_as_real_56,call_function,view_as_real.default,backward,27,1,1,1,3182,3001,6 
-3309,view_659,call_function,view.default,backward,27,1,1,1,3183,3000,6 -3310,convert_element_type_711,call_function,convert_element_type.default,backward,27,1,1,1,3184,2999,6 -3311,view_as_real_57,call_function,view_as_real.default,backward,27,1,1,1,3182,3000,6 -3312,view_660,call_function,view.default,backward,27,1,1,1,3183,2999,6 -3313,convert_element_type_712,call_function,convert_element_type.default,backward,27,1,1,1,3184,2998,6 -3314,view_661,call_function,view.default,backward,27,1,1,1,3176,2998,2 -3315,view_662,call_function,view.default,backward,27,1,1,1,3185,2998,5 -3316,view_663,call_function,view.default,backward,27,1,1,1,3185,2997,5 -3317,alias_default_807,call_function,alias.default,backward,27,1,1,2,3177,2997,4 -3318,einsum_default_207,call_function,einsum.default,backward,27,2,2,1,3178,3,5 -3319,permute_335,call_function,permute.default,backward,27,1,1,1,4,2993,3 -3320,einsum_default_208,call_function,einsum.default,backward,27,2,2,1,3179,2992,5 -3321,permute_336,call_function,permute.default,backward,27,1,1,1,3179,2,4 -3322,dtype_cast_262,call_function,dtype_cast.default,backward,27,1,1,1,3180,1,4 -3323,alias_default_1492,call_function,alias.default,backward,27,1,1,0,3181,0,3 -3324,alias_default_808,call_function,alias.default,backward,27,1,1,2,3186,2997,4 -3325,einsum_default_209,call_function,einsum.default,backward,27,2,2,1,3187,3,5 -3326,permute_339,call_function,permute.default,backward,27,1,1,1,4,2993,3 -3327,einsum_default_210,call_function,einsum.default,backward,27,2,2,1,3188,2992,5 -3328,add_145,call_function,add.Tensor,unknown,,2,2,1,3195,2991,10 -3329,permute_340,call_function,permute.default,backward,27,1,1,1,3188,2,4 -3330,dtype_cast_263,call_function,dtype_cast.default,backward,27,1,1,1,3189,1,4 -3331,alias_default_1491,call_function,alias.default,backward,27,1,1,0,3190,0,3 -3332,alias_default_809,call_function,alias.default,backward,27,1,1,2,3186,2996,4 
-3333,einsum_default_211,call_function,einsum.default,backward,27,2,2,1,3187,3,5 -3334,permute_343,call_function,permute.default,backward,27,1,1,1,4,2992,3 -3335,einsum_default_212,call_function,einsum.default,backward,27,2,2,1,3188,2991,5 -3336,add_146,call_function,add.Tensor,unknown,,2,2,1,3211,2990,10 -3337,permute_344,call_function,permute.default,backward,27,1,1,1,3188,2,4 -3338,dtype_cast_264,call_function,dtype_cast.default,backward,27,1,1,1,3189,1,4 -3339,alias_default_1490,call_function,alias.default,backward,27,1,1,0,3190,0,3 -3340,convert_element_type_725,call_function,convert_element_type.default,backward,27,1,1,1,3212,2989,8 -3341,convert_element_type_726,call_function,convert_element_type.default,backward,27,1,1,1,2979,2989,4 -3342,convert_element_type_727,call_function,convert_element_type.default,backward,27,1,1,1,3,2983,2 -3343,alias_default_810,call_function,alias.default,backward,27,1,1,2,3213,2988,4 -3344,mul_218,call_function,mul.Tensor,backward,27,2,2,1,3215,2982,8 -3345,mul_219,call_function,mul.Tensor,backward,27,2,2,1,2987,2988,8 -3346,alias_default_811,call_function,alias.default,backward,27,1,1,2,3216,2981,4 -3347,alias_default_812,call_function,alias.default,backward,27,1,1,3,2988,2987,4 -3348,mul_220,call_function,mul.Tensor,backward,27,2,2,1,3220,2980,8 -3349,sum_5,call_function,sum.dim_IntList,backward,27,1,1,1,3221,2979,5 -3350,div_30,call_function,div.Tensor,backward,27,1,1,1,2989,2979,6 -3351,mul_221,call_function,mul.Tensor,backward,27,2,2,1,3223,2978,8 -3352,sub_3,call_function,sub.Tensor,backward,27,2,2,1,3224,2977,10 -3353,mul_222,call_function,mul.Tensor,backward,27,2,2,1,3225,2976,8 -3354,mul_223,call_function,mul.Tensor,backward,27,2,2,1,3217,4,8 -3355,sum_6,call_function,sum.dim_IntList,backward,27,1,1,1,3218,3,5 -3356,convert_element_type_728,call_function,convert_element_type.default,backward,27,1,1,1,3226,2975,6 -3357,convert_element_type_729,call_function,convert_element_type.default,backward,27,1,1,1,3219,2,3 
-3358,add_147,call_function,add.Tensor,unknown,,2,2,1,3227,2974,10 -3359,dtype_cast_265,call_function,dtype_cast.default,backward,27,1,1,1,3220,1,3 -3360,alias_default_1497,call_function,alias.default,backward,27,1,1,0,3221,0,2 -3361,alias_default_813,call_function,alias.default,unknown,,1,1,3,3228,2973,4 -3362,einsum_default_213,call_function,einsum.default,backward,26,2,2,1,3229,3,5 -3363,permute_347,call_function,permute.default,backward,26,1,1,1,4,2969,3 -3364,einsum_default_214,call_function,einsum.default,backward,26,2,2,1,3230,2968,5 -3365,permute_348,call_function,permute.default,backward,26,1,1,1,3230,2,4 -3366,dtype_cast_266,call_function,dtype_cast.default,backward,26,1,1,1,3231,1,4 -3367,alias_default_1486,call_function,alias.default,backward,26,1,1,0,3232,0,3 -3368,alias_default_814,call_function,alias.default,backward,26,1,1,2,3231,2967,4 -3369,mul_224,call_function,mul.Tensor,backward,26,2,2,1,3232,2955,8 -3370,mul_225,call_function,mul.Tensor,backward,26,2,2,1,3232,2959,8 -3371,alias_default_815,call_function,alias.default,backward,26,1,1,2,3233,2954,4 -3372,einsum_default_215,call_function,einsum.default,backward,26,2,2,1,3234,3,5 -3373,permute_351,call_function,permute.default,backward,26,1,1,1,4,2950,3 -3374,einsum_default_216,call_function,einsum.default,backward,26,2,2,1,3235,2949,5 -3375,permute_352,call_function,permute.default,backward,26,1,1,1,3235,2,4 -3376,dtype_cast_267,call_function,dtype_cast.default,backward,26,1,1,1,3236,1,4 -3377,alias_default_1487,call_function,alias.default,backward,26,1,1,0,3237,0,3 -3378,convert_element_type_738,call_function,convert_element_type.default,backward,26,1,1,1,3233,2958,6 -3379,convert_element_type_739,call_function,convert_element_type.default,backward,26,1,1,1,2956,2968,4 -3380,alias_default_816,call_function,alias.default,backward,26,1,1,2,2957,2967,4 -3381,neg_29,call_function,neg.default,backward,26,1,1,1,2958,2966,8 -3382,exp_29,call_function,exp.default,backward,26,1,1,1,2959,2965,6 
-3383,add_148,call_function,add.Tensor,backward,26,1,1,1,2960,2964,4 -3384,reciprocal_1,call_function,reciprocal.default,backward,26,1,1,1,2961,2963,4 -3385,mul_226,call_function,mul.Tensor,backward,26,1,1,1,2962,2962,6 -3386,alias_default_817,call_function,alias.default,backward,26,1,1,2,2963,2961,4 -3387,mul_227,call_function,mul.Tensor,backward,26,2,2,1,3242,2957,8 -3388,sub_4,call_function,sub.Tensor,backward,26,1,1,1,2964,2959,4 -3389,mul_228,call_function,mul.Tensor,backward,26,2,2,1,2965,2958,8 -3390,add_149,call_function,add.Tensor,backward,26,1,1,1,2966,2957,4 -3391,mul_229,call_function,mul.Tensor,backward,26,2,2,1,3246,2956,8 -3392,convert_element_type_740,call_function,convert_element_type.default,backward,26,1,1,1,3247,2955,6 -3393,alias_default_818,call_function,alias.default,backward,26,1,1,2,3248,2954,4 -3394,einsum_default_217,call_function,einsum.default,backward,26,2,2,1,3249,3,5 -3395,permute_355,call_function,permute.default,backward,26,1,1,1,4,2950,3 -3396,einsum_default_218,call_function,einsum.default,backward,26,2,2,1,3250,2949,5 -3397,add_150,call_function,add.Tensor,unknown,,2,2,1,3255,2948,10 -3398,permute_356,call_function,permute.default,backward,26,1,1,1,3250,2,4 -3399,dtype_cast_268,call_function,dtype_cast.default,backward,26,1,1,1,3251,1,4 -3400,alias_default_1485,call_function,alias.default,backward,26,1,1,0,3252,0,3 -3401,convert_element_type_745,call_function,convert_element_type.default,backward,26,1,1,1,3256,2947,8 -3402,convert_element_type_746,call_function,convert_element_type.default,backward,26,1,1,1,2936,2947,4 -3403,convert_element_type_747,call_function,convert_element_type.default,backward,26,1,1,1,3,2941,2 -3404,alias_default_819,call_function,alias.default,backward,26,1,1,2,3257,2946,4 -3405,mul_230,call_function,mul.Tensor,backward,26,2,2,1,3259,2940,8 -3406,mul_231,call_function,mul.Tensor,backward,26,2,2,1,2944,2946,8 -3407,alias_default_820,call_function,alias.default,backward,26,1,1,2,3260,2939,4 
-3408,alias_default_821,call_function,alias.default,backward,26,1,1,3,2945,2945,4 -3409,mul_232,call_function,mul.Tensor,backward,26,2,2,1,3264,2938,8 -3410,sum_7,call_function,sum.dim_IntList,backward,26,1,1,1,3265,2937,5 -3411,div_31,call_function,div.Tensor,backward,26,1,1,1,2946,2937,6 -3412,mul_233,call_function,mul.Tensor,backward,26,2,2,1,3267,2936,8 -3413,sub_5,call_function,sub.Tensor,backward,26,2,2,1,3268,2935,10 -3414,mul_234,call_function,mul.Tensor,backward,26,2,2,1,3269,2934,8 -3415,mul_235,call_function,mul.Tensor,backward,26,2,2,1,3261,4,8 -3416,sum_8,call_function,sum.dim_IntList,backward,26,1,1,1,3262,3,5 -3417,convert_element_type_748,call_function,convert_element_type.default,backward,26,1,1,1,3270,2933,6 -3418,convert_element_type_749,call_function,convert_element_type.default,backward,26,1,1,1,3263,2,3 -3419,add_151,call_function,add.Tensor,unknown,,2,2,1,3271,2932,10 -3420,dtype_cast_269,call_function,dtype_cast.default,backward,26,1,1,1,3264,1,3 -3421,alias_default_1489,call_function,alias.default,backward,26,1,1,0,3265,0,2 -3422,alias_default_822,call_function,alias.default,unknown,,1,1,3,3272,2931,4 -3423,einsum_default_219,call_function,einsum.default,backward,26,2,2,1,3273,3,5 -3424,permute_359,call_function,permute.default,backward,26,1,1,1,4,2927,3 -3425,einsum_default_220,call_function,einsum.default,backward,26,2,2,1,3274,2926,5 -3426,permute_360,call_function,permute.default,backward,26,1,1,1,3274,2,4 -3427,dtype_cast_270,call_function,dtype_cast.default,backward,26,1,1,1,3275,1,4 -3428,alias_default_1484,call_function,alias.default,backward,26,1,1,0,3276,0,3 -3429,view_678,call_function,view.default,backward,26,1,1,1,3275,2925,4 -3430,permute_361,call_function,permute.default,backward,26,1,1,1,3276,2924,4 -3431,_scaled_dot_product_flash_attention_backward_1,call_function,_scaled_dot_product_flash_attention_backward.default,backward,26,8,8,3,3280,2923,2 -3432,getitem_255,call_function,getitem,backward,26,1,1,1,3281,2896,2 
-3433,getitem_256,call_function,getitem,backward,26,1,1,1,3281,2897,2 -3434,getitem_257,call_function,getitem,backward,26,1,1,1,3281,2890,2 -3435,permute_362,call_function,permute.default,backward,26,1,1,1,3282,2889,2 -3436,permute_363,call_function,permute.default,backward,26,1,1,1,3282,2896,2 -3437,permute_364,call_function,permute.default,backward,26,1,1,1,3282,2895,2 -3438,convert_element_type_754,call_function,convert_element_type.default,backward,26,1,1,1,3283,2895,2 -3439,convert_element_type_755,call_function,convert_element_type.default,backward,26,1,1,1,3283,2894,2 -3440,view_679,call_function,view.default,backward,26,1,1,1,3284,2894,2 -3441,view_as_complex_58,call_function,view_as_complex.default,backward,26,1,1,1,3285,2893,6 -3442,_conj_2,call_function,_conj.default,backward,26,1,1,1,4,2894,3 -3443,clone_14,call_function,clone.default,backward,26,1,1,1,5,2893,3 -3444,mul_236,call_function,mul.Tensor,backward,26,2,2,1,3288,2892,8 -3445,view_680,call_function,view.default,backward,26,1,1,1,3284,2893,2 -3446,view_as_complex_59,call_function,view_as_complex.default,backward,26,1,1,1,3285,2892,6 -3447,_conj_3,call_function,_conj.default,backward,26,1,1,1,4,2893,3 -3448,clone_15,call_function,clone.default,backward,26,1,1,1,5,2892,3 -3449,mul_237,call_function,mul.Tensor,backward,26,2,2,1,3288,2891,8 -3450,view_as_real_58,call_function,view_as_real.default,backward,26,1,1,1,3289,2891,6 -3451,view_681,call_function,view.default,backward,26,1,1,1,3290,2890,6 -3452,convert_element_type_756,call_function,convert_element_type.default,backward,26,1,1,1,3291,2889,6 -3453,view_as_real_59,call_function,view_as_real.default,backward,26,1,1,1,3289,2890,6 -3454,view_682,call_function,view.default,backward,26,1,1,1,3290,2889,6 -3455,convert_element_type_757,call_function,convert_element_type.default,backward,26,1,1,1,3291,2888,6 -3456,view_683,call_function,view.default,backward,26,1,1,1,3283,2888,2 -3457,view_684,call_function,view.default,backward,26,1,1,1,3292,2888,5 
-3458,view_685,call_function,view.default,backward,26,1,1,1,3292,2887,5 -3459,alias_default_823,call_function,alias.default,backward,26,1,1,2,3284,2887,4 -3460,einsum_default_221,call_function,einsum.default,backward,26,2,2,1,3285,3,5 -3461,permute_367,call_function,permute.default,backward,26,1,1,1,4,2883,3 -3462,einsum_default_222,call_function,einsum.default,backward,26,2,2,1,3286,2882,5 -3463,permute_368,call_function,permute.default,backward,26,1,1,1,3286,2,4 -3464,dtype_cast_271,call_function,dtype_cast.default,backward,26,1,1,1,3287,1,4 -3465,alias_default_1483,call_function,alias.default,backward,26,1,1,0,3288,0,3 -3466,alias_default_824,call_function,alias.default,backward,26,1,1,2,3293,2887,4 -3467,einsum_default_223,call_function,einsum.default,backward,26,2,2,1,3294,3,5 -3468,permute_371,call_function,permute.default,backward,26,1,1,1,4,2883,3 -3469,einsum_default_224,call_function,einsum.default,backward,26,2,2,1,3295,2882,5 -3470,add_152,call_function,add.Tensor,unknown,,2,2,1,3302,2881,10 -3471,permute_372,call_function,permute.default,backward,26,1,1,1,3295,2,4 -3472,dtype_cast_272,call_function,dtype_cast.default,backward,26,1,1,1,3296,1,4 -3473,alias_default_1482,call_function,alias.default,backward,26,1,1,0,3297,0,3 -3474,alias_default_825,call_function,alias.default,backward,26,1,1,2,3293,2886,4 -3475,einsum_default_225,call_function,einsum.default,backward,26,2,2,1,3294,3,5 -3476,permute_375,call_function,permute.default,backward,26,1,1,1,4,2882,3 -3477,einsum_default_226,call_function,einsum.default,backward,26,2,2,1,3295,2881,5 -3478,add_153,call_function,add.Tensor,unknown,,2,2,1,3318,2880,10 -3479,permute_376,call_function,permute.default,backward,26,1,1,1,3295,2,4 -3480,dtype_cast_273,call_function,dtype_cast.default,backward,26,1,1,1,3296,1,4 -3481,alias_default_1481,call_function,alias.default,backward,26,1,1,0,3297,0,3 -3482,convert_element_type_770,call_function,convert_element_type.default,backward,26,1,1,1,3319,2879,8 
-3483,convert_element_type_771,call_function,convert_element_type.default,backward,26,1,1,1,2869,2879,4 -3484,convert_element_type_772,call_function,convert_element_type.default,backward,26,1,1,1,3,2873,2 -3485,alias_default_826,call_function,alias.default,backward,26,1,1,2,3320,2878,4 -3486,mul_238,call_function,mul.Tensor,backward,26,2,2,1,3322,2872,8 -3487,mul_239,call_function,mul.Tensor,backward,26,2,2,1,2877,2878,8 -3488,alias_default_827,call_function,alias.default,backward,26,1,1,2,3323,2871,4 -3489,alias_default_828,call_function,alias.default,backward,26,1,1,3,2878,2877,4 -3490,mul_240,call_function,mul.Tensor,backward,26,2,2,1,3327,2870,8 -3491,sum_9,call_function,sum.dim_IntList,backward,26,1,1,1,3328,2869,5 -3492,div_32,call_function,div.Tensor,backward,26,1,1,1,2879,2869,6 -3493,mul_241,call_function,mul.Tensor,backward,26,2,2,1,3330,2868,8 -3494,sub_6,call_function,sub.Tensor,backward,26,2,2,1,3331,2867,10 -3495,mul_242,call_function,mul.Tensor,backward,26,2,2,1,3332,2866,8 -3496,mul_243,call_function,mul.Tensor,backward,26,2,2,1,3324,4,8 -3497,sum_10,call_function,sum.dim_IntList,backward,26,1,1,1,3325,3,5 -3498,convert_element_type_773,call_function,convert_element_type.default,backward,26,1,1,1,3333,2865,6 -3499,convert_element_type_774,call_function,convert_element_type.default,backward,26,1,1,1,3326,2,3 -3500,add_154,call_function,add.Tensor,unknown,,2,2,1,3334,2864,10 -3501,dtype_cast_274,call_function,dtype_cast.default,backward,26,1,1,1,3327,1,3 -3502,alias_default_1488,call_function,alias.default,backward,26,1,1,0,3328,0,2 -3503,alias_default_829,call_function,alias.default,unknown,,1,1,3,3335,2863,4 -3504,einsum_default_227,call_function,einsum.default,backward,25,2,2,1,3336,3,5 -3505,permute_379,call_function,permute.default,backward,25,1,1,1,4,2859,3 -3506,einsum_default_228,call_function,einsum.default,backward,25,2,2,1,3337,2858,5 -3507,permute_380,call_function,permute.default,backward,25,1,1,1,3337,2,4 
-3508,dtype_cast_275,call_function,dtype_cast.default,backward,25,1,1,1,3338,1,4 -3509,alias_default_1477,call_function,alias.default,backward,25,1,1,0,3339,0,3 -3510,alias_default_830,call_function,alias.default,backward,25,1,1,2,3338,2857,4 -3511,mul_244,call_function,mul.Tensor,backward,25,2,2,1,3339,2845,8 -3512,mul_245,call_function,mul.Tensor,backward,25,2,2,1,3339,2849,8 -3513,alias_default_831,call_function,alias.default,backward,25,1,1,2,3340,2844,4 -3514,einsum_default_229,call_function,einsum.default,backward,25,2,2,1,3341,3,5 -3515,permute_383,call_function,permute.default,backward,25,1,1,1,4,2840,3 -3516,einsum_default_230,call_function,einsum.default,backward,25,2,2,1,3342,2839,5 -3517,permute_384,call_function,permute.default,backward,25,1,1,1,3342,2,4 -3518,dtype_cast_276,call_function,dtype_cast.default,backward,25,1,1,1,3343,1,4 -3519,alias_default_1478,call_function,alias.default,backward,25,1,1,0,3344,0,3 -3520,convert_element_type_783,call_function,convert_element_type.default,backward,25,1,1,1,3340,2848,6 -3521,convert_element_type_784,call_function,convert_element_type.default,backward,25,1,1,1,2846,2858,4 -3522,alias_default_832,call_function,alias.default,backward,25,1,1,2,2847,2857,4 -3523,neg_30,call_function,neg.default,backward,25,1,1,1,2848,2856,8 -3524,exp_30,call_function,exp.default,backward,25,1,1,1,2849,2855,6 -3525,add_155,call_function,add.Tensor,backward,25,1,1,1,2850,2854,4 -3526,reciprocal_2,call_function,reciprocal.default,backward,25,1,1,1,2851,2853,4 -3527,mul_246,call_function,mul.Tensor,backward,25,1,1,1,2852,2852,6 -3528,alias_default_833,call_function,alias.default,backward,25,1,1,2,2853,2851,4 -3529,mul_247,call_function,mul.Tensor,backward,25,2,2,1,3349,2847,8 -3530,sub_7,call_function,sub.Tensor,backward,25,1,1,1,2854,2849,4 -3531,mul_248,call_function,mul.Tensor,backward,25,2,2,1,2855,2848,8 -3532,add_156,call_function,add.Tensor,backward,25,1,1,1,2856,2847,4 
-3533,mul_249,call_function,mul.Tensor,backward,25,2,2,1,3353,2846,8 -3534,convert_element_type_785,call_function,convert_element_type.default,backward,25,1,1,1,3354,2845,6 -3535,alias_default_834,call_function,alias.default,backward,25,1,1,2,3355,2844,4 -3536,einsum_default_231,call_function,einsum.default,backward,25,2,2,1,3356,3,5 -3537,permute_387,call_function,permute.default,backward,25,1,1,1,4,2840,3 -3538,einsum_default_232,call_function,einsum.default,backward,25,2,2,1,3357,2839,5 -3539,add_157,call_function,add.Tensor,unknown,,2,2,1,3362,2838,10 -3540,permute_388,call_function,permute.default,backward,25,1,1,1,3357,2,4 -3541,dtype_cast_277,call_function,dtype_cast.default,backward,25,1,1,1,3358,1,4 -3542,alias_default_1476,call_function,alias.default,backward,25,1,1,0,3359,0,3 -3543,convert_element_type_790,call_function,convert_element_type.default,backward,25,1,1,1,3363,2837,8 -3544,convert_element_type_791,call_function,convert_element_type.default,backward,25,1,1,1,2826,2837,4 -3545,convert_element_type_792,call_function,convert_element_type.default,backward,25,1,1,1,3,2831,2 -3546,alias_default_835,call_function,alias.default,backward,25,1,1,2,3364,2836,4 -3547,mul_250,call_function,mul.Tensor,backward,25,2,2,1,3366,2830,8 -3548,mul_251,call_function,mul.Tensor,backward,25,2,2,1,2834,2836,8 -3549,alias_default_836,call_function,alias.default,backward,25,1,1,2,3367,2829,4 -3550,alias_default_837,call_function,alias.default,backward,25,1,1,3,2835,2835,4 -3551,mul_252,call_function,mul.Tensor,backward,25,2,2,1,3371,2828,8 -3552,sum_11,call_function,sum.dim_IntList,backward,25,1,1,1,3372,2827,5 -3553,div_33,call_function,div.Tensor,backward,25,1,1,1,2836,2827,6 -3554,mul_253,call_function,mul.Tensor,backward,25,2,2,1,3374,2826,8 -3555,sub_8,call_function,sub.Tensor,backward,25,2,2,1,3375,2825,10 -3556,mul_254,call_function,mul.Tensor,backward,25,2,2,1,3376,2824,8 -3557,mul_255,call_function,mul.Tensor,backward,25,2,2,1,3368,4,8 
-3558,sum_12,call_function,sum.dim_IntList,backward,25,1,1,1,3369,3,5 -3559,convert_element_type_793,call_function,convert_element_type.default,backward,25,1,1,1,3377,2823,6 -3560,convert_element_type_794,call_function,convert_element_type.default,backward,25,1,1,1,3370,2,3 -3561,add_158,call_function,add.Tensor,unknown,,2,2,1,3378,2822,10 -3562,dtype_cast_278,call_function,dtype_cast.default,backward,25,1,1,1,3371,1,3 -3563,alias_default_1480,call_function,alias.default,backward,25,1,1,0,3372,0,2 -3564,alias_default_838,call_function,alias.default,unknown,,1,1,3,3379,2821,4 -3565,einsum_default_233,call_function,einsum.default,backward,25,2,2,1,3380,3,5 -3566,permute_391,call_function,permute.default,backward,25,1,1,1,4,2817,3 -3567,einsum_default_234,call_function,einsum.default,backward,25,2,2,1,3381,2816,5 -3568,permute_392,call_function,permute.default,backward,25,1,1,1,3381,2,4 -3569,dtype_cast_279,call_function,dtype_cast.default,backward,25,1,1,1,3382,1,4 -3570,alias_default_1475,call_function,alias.default,backward,25,1,1,0,3383,0,3 -3571,view_700,call_function,view.default,backward,25,1,1,1,3382,2815,4 -3572,permute_393,call_function,permute.default,backward,25,1,1,1,3383,2814,4 -3573,_scaled_dot_product_flash_attention_backward_2,call_function,_scaled_dot_product_flash_attention_backward.default,backward,25,8,8,3,3387,2813,2 -3574,getitem_258,call_function,getitem,backward,25,1,1,1,3388,2786,2 -3575,getitem_259,call_function,getitem,backward,25,1,1,1,3388,2787,2 -3576,getitem_260,call_function,getitem,backward,25,1,1,1,3388,2780,2 -3577,permute_394,call_function,permute.default,backward,25,1,1,1,3389,2779,2 -3578,permute_395,call_function,permute.default,backward,25,1,1,1,3389,2786,2 -3579,permute_396,call_function,permute.default,backward,25,1,1,1,3389,2785,2 -3580,convert_element_type_799,call_function,convert_element_type.default,backward,25,1,1,1,3390,2785,2 
-3581,convert_element_type_800,call_function,convert_element_type.default,backward,25,1,1,1,3390,2784,2 -3582,view_701,call_function,view.default,backward,25,1,1,1,3391,2784,2 -3583,view_as_complex_60,call_function,view_as_complex.default,backward,25,1,1,1,3392,2783,6 -3584,_conj_4,call_function,_conj.default,backward,25,1,1,1,4,2784,3 -3585,clone_22,call_function,clone.default,backward,25,1,1,1,5,2783,3 -3586,mul_256,call_function,mul.Tensor,backward,25,2,2,1,3395,2782,8 -3587,view_702,call_function,view.default,backward,25,1,1,1,3391,2783,2 -3588,view_as_complex_61,call_function,view_as_complex.default,backward,25,1,1,1,3392,2782,6 -3589,_conj_5,call_function,_conj.default,backward,25,1,1,1,4,2783,3 -3590,clone_23,call_function,clone.default,backward,25,1,1,1,5,2782,3 -3591,mul_257,call_function,mul.Tensor,backward,25,2,2,1,3395,2781,8 -3592,view_as_real_60,call_function,view_as_real.default,backward,25,1,1,1,3396,2781,6 -3593,view_703,call_function,view.default,backward,25,1,1,1,3397,2780,6 -3594,convert_element_type_801,call_function,convert_element_type.default,backward,25,1,1,1,3398,2779,6 -3595,view_as_real_61,call_function,view_as_real.default,backward,25,1,1,1,3396,2780,6 -3596,view_704,call_function,view.default,backward,25,1,1,1,3397,2779,6 -3597,convert_element_type_802,call_function,convert_element_type.default,backward,25,1,1,1,3398,2778,6 -3598,view_705,call_function,view.default,backward,25,1,1,1,3390,2778,2 -3599,view_706,call_function,view.default,backward,25,1,1,1,3399,2778,5 -3600,view_707,call_function,view.default,backward,25,1,1,1,3399,2777,5 -3601,alias_default_839,call_function,alias.default,backward,25,1,1,2,3391,2777,4 -3602,einsum_default_235,call_function,einsum.default,backward,25,2,2,1,3392,3,5 -3603,permute_399,call_function,permute.default,backward,25,1,1,1,4,2773,3 -3604,einsum_default_236,call_function,einsum.default,backward,25,2,2,1,3393,2772,5 -3605,permute_400,call_function,permute.default,backward,25,1,1,1,3393,2,4 
-3606,dtype_cast_280,call_function,dtype_cast.default,backward,25,1,1,1,3394,1,4 -3607,alias_default_1474,call_function,alias.default,backward,25,1,1,0,3395,0,3 -3608,alias_default_840,call_function,alias.default,backward,25,1,1,2,3400,2777,4 -3609,einsum_default_237,call_function,einsum.default,backward,25,2,2,1,3401,3,5 -3610,permute_403,call_function,permute.default,backward,25,1,1,1,4,2773,3 -3611,einsum_default_238,call_function,einsum.default,backward,25,2,2,1,3402,2772,5 -3612,add_159,call_function,add.Tensor,unknown,,2,2,1,3409,2771,10 -3613,permute_404,call_function,permute.default,backward,25,1,1,1,3402,2,4 -3614,dtype_cast_281,call_function,dtype_cast.default,backward,25,1,1,1,3403,1,4 -3615,alias_default_1473,call_function,alias.default,backward,25,1,1,0,3404,0,3 -3616,alias_default_841,call_function,alias.default,backward,25,1,1,2,3400,2776,4 -3617,einsum_default_239,call_function,einsum.default,backward,25,2,2,1,3401,3,5 -3618,permute_407,call_function,permute.default,backward,25,1,1,1,4,2772,3 -3619,einsum_default_240,call_function,einsum.default,backward,25,2,2,1,3402,2771,5 -3620,add_160,call_function,add.Tensor,unknown,,2,2,1,3425,2770,10 -3621,permute_408,call_function,permute.default,backward,25,1,1,1,3402,2,4 -3622,dtype_cast_282,call_function,dtype_cast.default,backward,25,1,1,1,3403,1,4 -3623,alias_default_1472,call_function,alias.default,backward,25,1,1,0,3404,0,3 -3624,convert_element_type_815,call_function,convert_element_type.default,backward,25,1,1,1,3426,2769,8 -3625,convert_element_type_816,call_function,convert_element_type.default,backward,25,1,1,1,2759,2769,4 -3626,convert_element_type_817,call_function,convert_element_type.default,backward,25,1,1,1,3,2763,2 -3627,alias_default_842,call_function,alias.default,backward,25,1,1,2,3427,2768,4 -3628,mul_258,call_function,mul.Tensor,backward,25,2,2,1,3429,2762,8 -3629,mul_259,call_function,mul.Tensor,backward,25,2,2,1,2767,2768,8 
-3630,alias_default_843,call_function,alias.default,backward,25,1,1,2,3430,2761,4 -3631,alias_default_844,call_function,alias.default,backward,25,1,1,3,2768,2767,4 -3632,mul_260,call_function,mul.Tensor,backward,25,2,2,1,3434,2760,8 -3633,sum_13,call_function,sum.dim_IntList,backward,25,1,1,1,3435,2759,5 -3634,div_34,call_function,div.Tensor,backward,25,1,1,1,2769,2759,6 -3635,mul_261,call_function,mul.Tensor,backward,25,2,2,1,3437,2758,8 -3636,sub_9,call_function,sub.Tensor,backward,25,2,2,1,3438,2757,10 -3637,mul_262,call_function,mul.Tensor,backward,25,2,2,1,3439,2756,8 -3638,mul_263,call_function,mul.Tensor,backward,25,2,2,1,3431,4,8 -3639,sum_14,call_function,sum.dim_IntList,backward,25,1,1,1,3432,3,5 -3640,convert_element_type_818,call_function,convert_element_type.default,backward,25,1,1,1,3440,2755,6 -3641,convert_element_type_819,call_function,convert_element_type.default,backward,25,1,1,1,3433,2,3 -3642,add_161,call_function,add.Tensor,unknown,,2,2,1,3441,2754,10 -3643,dtype_cast_283,call_function,dtype_cast.default,backward,25,1,1,1,3434,1,3 -3644,alias_default_1479,call_function,alias.default,backward,25,1,1,0,3435,0,2 -3645,alias_default_845,call_function,alias.default,unknown,,1,1,3,3442,2753,4 -3646,einsum_default_241,call_function,einsum.default,backward,24,2,2,1,3443,3,5 -3647,permute_411,call_function,permute.default,backward,24,1,1,1,4,2749,3 -3648,einsum_default_242,call_function,einsum.default,backward,24,2,2,1,3444,2748,5 -3649,permute_412,call_function,permute.default,backward,24,1,1,1,3444,2,4 -3650,dtype_cast_284,call_function,dtype_cast.default,backward,24,1,1,1,3445,1,4 -3651,alias_default_1468,call_function,alias.default,backward,24,1,1,0,3446,0,3 -3652,alias_default_846,call_function,alias.default,backward,24,1,1,2,3445,2747,4 -3653,mul_264,call_function,mul.Tensor,backward,24,2,2,1,3446,2735,8 -3654,mul_265,call_function,mul.Tensor,backward,24,2,2,1,3446,2739,8 
-3655,alias_default_847,call_function,alias.default,backward,24,1,1,2,3447,2734,4 -3656,einsum_default_243,call_function,einsum.default,backward,24,2,2,1,3448,3,5 -3657,permute_415,call_function,permute.default,backward,24,1,1,1,4,2730,3 -3658,einsum_default_244,call_function,einsum.default,backward,24,2,2,1,3449,2729,5 -3659,permute_416,call_function,permute.default,backward,24,1,1,1,3449,2,4 -3660,dtype_cast_285,call_function,dtype_cast.default,backward,24,1,1,1,3450,1,4 -3661,alias_default_1469,call_function,alias.default,backward,24,1,1,0,3451,0,3 -3662,convert_element_type_828,call_function,convert_element_type.default,backward,24,1,1,1,3447,2738,6 -3663,convert_element_type_829,call_function,convert_element_type.default,backward,24,1,1,1,2736,2748,4 -3664,alias_default_848,call_function,alias.default,backward,24,1,1,2,2737,2747,4 -3665,neg_31,call_function,neg.default,backward,24,1,1,1,2738,2746,8 -3666,exp_31,call_function,exp.default,backward,24,1,1,1,2739,2745,6 -3667,add_162,call_function,add.Tensor,backward,24,1,1,1,2740,2744,4 -3668,reciprocal_3,call_function,reciprocal.default,backward,24,1,1,1,2741,2743,4 -3669,mul_266,call_function,mul.Tensor,backward,24,1,1,1,2742,2742,6 -3670,alias_default_849,call_function,alias.default,backward,24,1,1,2,2743,2741,4 -3671,mul_267,call_function,mul.Tensor,backward,24,2,2,1,3456,2737,8 -3672,sub_10,call_function,sub.Tensor,backward,24,1,1,1,2744,2739,4 -3673,mul_268,call_function,mul.Tensor,backward,24,2,2,1,2745,2738,8 -3674,add_163,call_function,add.Tensor,backward,24,1,1,1,2746,2737,4 -3675,mul_269,call_function,mul.Tensor,backward,24,2,2,1,3460,2736,8 -3676,convert_element_type_830,call_function,convert_element_type.default,backward,24,1,1,1,3461,2735,6 -3677,alias_default_850,call_function,alias.default,backward,24,1,1,2,3462,2734,4 -3678,einsum_default_245,call_function,einsum.default,backward,24,2,2,1,3463,3,5 -3679,permute_419,call_function,permute.default,backward,24,1,1,1,4,2730,3 
-3680,einsum_default_246,call_function,einsum.default,backward,24,2,2,1,3464,2729,5 -3681,add_164,call_function,add.Tensor,unknown,,2,2,1,3469,2728,10 -3682,permute_420,call_function,permute.default,backward,24,1,1,1,3464,2,4 -3683,dtype_cast_286,call_function,dtype_cast.default,backward,24,1,1,1,3465,1,4 -3684,alias_default_1467,call_function,alias.default,backward,24,1,1,0,3466,0,3 -3685,convert_element_type_835,call_function,convert_element_type.default,backward,24,1,1,1,3470,2727,8 -3686,convert_element_type_836,call_function,convert_element_type.default,backward,24,1,1,1,2716,2727,4 -3687,convert_element_type_837,call_function,convert_element_type.default,backward,24,1,1,1,3,2721,2 -3688,alias_default_851,call_function,alias.default,backward,24,1,1,2,3471,2726,4 -3689,mul_270,call_function,mul.Tensor,backward,24,2,2,1,3473,2720,8 -3690,mul_271,call_function,mul.Tensor,backward,24,2,2,1,2724,2726,8 -3691,alias_default_852,call_function,alias.default,backward,24,1,1,2,3474,2719,4 -3692,alias_default_853,call_function,alias.default,backward,24,1,1,3,2725,2725,4 -3693,mul_272,call_function,mul.Tensor,backward,24,2,2,1,3478,2718,8 -3694,sum_15,call_function,sum.dim_IntList,backward,24,1,1,1,3479,2717,5 -3695,div_35,call_function,div.Tensor,backward,24,1,1,1,2726,2717,6 -3696,mul_273,call_function,mul.Tensor,backward,24,2,2,1,3481,2716,8 -3697,sub_11,call_function,sub.Tensor,backward,24,2,2,1,3482,2715,10 -3698,mul_274,call_function,mul.Tensor,backward,24,2,2,1,3483,2714,8 -3699,mul_275,call_function,mul.Tensor,backward,24,2,2,1,3475,4,8 -3700,sum_16,call_function,sum.dim_IntList,backward,24,1,1,1,3476,3,5 -3701,convert_element_type_838,call_function,convert_element_type.default,backward,24,1,1,1,3484,2713,6 -3702,convert_element_type_839,call_function,convert_element_type.default,backward,24,1,1,1,3477,2,3 -3703,add_165,call_function,add.Tensor,unknown,,2,2,1,3485,2712,10 -3704,dtype_cast_287,call_function,dtype_cast.default,backward,24,1,1,1,3478,1,3 
-3705,alias_default_1471,call_function,alias.default,backward,24,1,1,0,3479,0,2 -3706,alias_default_854,call_function,alias.default,unknown,,1,1,3,3486,2711,4 -3707,einsum_default_247,call_function,einsum.default,backward,24,2,2,1,3487,3,5 -3708,permute_423,call_function,permute.default,backward,24,1,1,1,4,2707,3 -3709,einsum_default_248,call_function,einsum.default,backward,24,2,2,1,3488,2706,5 -3710,permute_424,call_function,permute.default,backward,24,1,1,1,3488,2,4 -3711,dtype_cast_288,call_function,dtype_cast.default,backward,24,1,1,1,3489,1,4 -3712,alias_default_1466,call_function,alias.default,backward,24,1,1,0,3490,0,3 -3713,view_722,call_function,view.default,backward,24,1,1,1,3489,2705,4 -3714,permute_425,call_function,permute.default,backward,24,1,1,1,3490,2704,4 -3715,_scaled_dot_product_flash_attention_backward_3,call_function,_scaled_dot_product_flash_attention_backward.default,backward,24,8,8,3,3494,2703,2 -3716,getitem_261,call_function,getitem,backward,24,1,1,1,3495,2676,2 -3717,getitem_262,call_function,getitem,backward,24,1,1,1,3495,2677,2 -3718,getitem_263,call_function,getitem,backward,24,1,1,1,3495,2670,2 -3719,permute_426,call_function,permute.default,backward,24,1,1,1,3496,2669,2 -3720,permute_427,call_function,permute.default,backward,24,1,1,1,3496,2676,2 -3721,permute_428,call_function,permute.default,backward,24,1,1,1,3496,2675,2 -3722,convert_element_type_844,call_function,convert_element_type.default,backward,24,1,1,1,3497,2675,2 -3723,convert_element_type_845,call_function,convert_element_type.default,backward,24,1,1,1,3497,2674,2 -3724,view_723,call_function,view.default,backward,24,1,1,1,3498,2674,2 -3725,view_as_complex_62,call_function,view_as_complex.default,backward,24,1,1,1,3499,2673,6 -3726,_conj_6,call_function,_conj.default,backward,24,1,1,1,4,2674,3 -3727,clone_30,call_function,clone.default,backward,24,1,1,1,5,2673,3 -3728,mul_276,call_function,mul.Tensor,backward,24,2,2,1,3502,2672,8 
-3729,view_724,call_function,view.default,backward,24,1,1,1,3498,2673,2 -3730,view_as_complex_63,call_function,view_as_complex.default,backward,24,1,1,1,3499,2672,6 -3731,_conj_7,call_function,_conj.default,backward,24,1,1,1,4,2673,3 -3732,clone_31,call_function,clone.default,backward,24,1,1,1,5,2672,3 -3733,mul_277,call_function,mul.Tensor,backward,24,2,2,1,3502,2671,8 -3734,view_as_real_62,call_function,view_as_real.default,backward,24,1,1,1,3503,2671,6 -3735,view_725,call_function,view.default,backward,24,1,1,1,3504,2670,6 -3736,convert_element_type_846,call_function,convert_element_type.default,backward,24,1,1,1,3505,2669,6 -3737,view_as_real_63,call_function,view_as_real.default,backward,24,1,1,1,3503,2670,6 -3738,view_726,call_function,view.default,backward,24,1,1,1,3504,2669,6 -3739,convert_element_type_847,call_function,convert_element_type.default,backward,24,1,1,1,3505,2668,6 -3740,view_727,call_function,view.default,backward,24,1,1,1,3497,2668,2 -3741,view_728,call_function,view.default,backward,24,1,1,1,3506,2668,5 -3742,view_729,call_function,view.default,backward,24,1,1,1,3506,2667,5 -3743,alias_default_855,call_function,alias.default,backward,24,1,1,2,3498,2667,4 -3744,einsum_default_249,call_function,einsum.default,backward,24,2,2,1,3499,3,5 -3745,permute_431,call_function,permute.default,backward,24,1,1,1,4,2663,3 -3746,einsum_default_250,call_function,einsum.default,backward,24,2,2,1,3500,2662,5 -3747,permute_432,call_function,permute.default,backward,24,1,1,1,3500,2,4 -3748,dtype_cast_289,call_function,dtype_cast.default,backward,24,1,1,1,3501,1,4 -3749,alias_default_1465,call_function,alias.default,backward,24,1,1,0,3502,0,3 -3750,alias_default_856,call_function,alias.default,backward,24,1,1,2,3507,2667,4 -3751,einsum_default_251,call_function,einsum.default,backward,24,2,2,1,3508,3,5 -3752,permute_435,call_function,permute.default,backward,24,1,1,1,4,2663,3 -3753,einsum_default_252,call_function,einsum.default,backward,24,2,2,1,3509,2662,5 
-3754,add_166,call_function,add.Tensor,unknown,,2,2,1,3516,2661,10 -3755,permute_436,call_function,permute.default,backward,24,1,1,1,3509,2,4 -3756,dtype_cast_290,call_function,dtype_cast.default,backward,24,1,1,1,3510,1,4 -3757,alias_default_1464,call_function,alias.default,backward,24,1,1,0,3511,0,3 -3758,alias_default_857,call_function,alias.default,backward,24,1,1,2,3507,2666,4 -3759,einsum_default_253,call_function,einsum.default,backward,24,2,2,1,3508,3,5 -3760,permute_439,call_function,permute.default,backward,24,1,1,1,4,2662,3 -3761,einsum_default_254,call_function,einsum.default,backward,24,2,2,1,3509,2661,5 -3762,add_167,call_function,add.Tensor,unknown,,2,2,1,3532,2660,10 -3763,permute_440,call_function,permute.default,backward,24,1,1,1,3509,2,4 -3764,dtype_cast_291,call_function,dtype_cast.default,backward,24,1,1,1,3510,1,4 -3765,alias_default_1463,call_function,alias.default,backward,24,1,1,0,3511,0,3 -3766,convert_element_type_860,call_function,convert_element_type.default,backward,24,1,1,1,3533,2659,8 -3767,convert_element_type_861,call_function,convert_element_type.default,backward,24,1,1,1,2649,2659,4 -3768,convert_element_type_862,call_function,convert_element_type.default,backward,24,1,1,1,3,2653,2 -3769,alias_default_858,call_function,alias.default,backward,24,1,1,2,3534,2658,4 -3770,mul_278,call_function,mul.Tensor,backward,24,2,2,1,3536,2652,8 -3771,mul_279,call_function,mul.Tensor,backward,24,2,2,1,2657,2658,8 -3772,alias_default_859,call_function,alias.default,backward,24,1,1,2,3537,2651,4 -3773,alias_default_860,call_function,alias.default,backward,24,1,1,3,2658,2657,4 -3774,mul_280,call_function,mul.Tensor,backward,24,2,2,1,3541,2650,8 -3775,sum_17,call_function,sum.dim_IntList,backward,24,1,1,1,3542,2649,5 -3776,div_36,call_function,div.Tensor,backward,24,1,1,1,2659,2649,6 -3777,mul_281,call_function,mul.Tensor,backward,24,2,2,1,3544,2648,8 -3778,sub_12,call_function,sub.Tensor,backward,24,2,2,1,3545,2647,10 
-3779,mul_282,call_function,mul.Tensor,backward,24,2,2,1,3546,2646,8 -3780,mul_283,call_function,mul.Tensor,backward,24,2,2,1,3538,4,8 -3781,sum_18,call_function,sum.dim_IntList,backward,24,1,1,1,3539,3,5 -3782,convert_element_type_863,call_function,convert_element_type.default,backward,24,1,1,1,3547,2645,6 -3783,convert_element_type_864,call_function,convert_element_type.default,backward,24,1,1,1,3540,2,3 -3784,add_168,call_function,add.Tensor,unknown,,2,2,1,3548,2644,10 -3785,dtype_cast_292,call_function,dtype_cast.default,backward,24,1,1,1,3541,1,3 -3786,alias_default_1470,call_function,alias.default,backward,24,1,1,0,3542,0,2 -3787,alias_default_861,call_function,alias.default,unknown,,1,1,3,3549,2643,4 -3788,einsum_default_255,call_function,einsum.default,backward,23,2,2,1,3550,3,5 -3789,permute_443,call_function,permute.default,backward,23,1,1,1,4,2639,3 -3790,einsum_default_256,call_function,einsum.default,backward,23,2,2,1,3551,2638,5 -3791,permute_444,call_function,permute.default,backward,23,1,1,1,3551,2,4 -3792,dtype_cast_293,call_function,dtype_cast.default,backward,23,1,1,1,3552,1,4 -3793,alias_default_1459,call_function,alias.default,backward,23,1,1,0,3553,0,3 -3794,alias_default_862,call_function,alias.default,backward,23,1,1,2,3552,2637,4 -3795,mul_284,call_function,mul.Tensor,backward,23,2,2,1,3553,2625,8 -3796,mul_285,call_function,mul.Tensor,backward,23,2,2,1,3553,2629,8 -3797,alias_default_863,call_function,alias.default,backward,23,1,1,2,3554,2624,4 -3798,einsum_default_257,call_function,einsum.default,backward,23,2,2,1,3555,3,5 -3799,permute_447,call_function,permute.default,backward,23,1,1,1,4,2620,3 -3800,einsum_default_258,call_function,einsum.default,backward,23,2,2,1,3556,2619,5 -3801,permute_448,call_function,permute.default,backward,23,1,1,1,3556,2,4 -3802,dtype_cast_294,call_function,dtype_cast.default,backward,23,1,1,1,3557,1,4 -3803,alias_default_1460,call_function,alias.default,backward,23,1,1,0,3558,0,3 
-3804,convert_element_type_873,call_function,convert_element_type.default,backward,23,1,1,1,3554,2628,6 -3805,convert_element_type_874,call_function,convert_element_type.default,backward,23,1,1,1,2626,2638,4 -3806,alias_default_864,call_function,alias.default,backward,23,1,1,2,2627,2637,4 -3807,neg_32,call_function,neg.default,backward,23,1,1,1,2628,2636,8 -3808,exp_32,call_function,exp.default,backward,23,1,1,1,2629,2635,6 -3809,add_169,call_function,add.Tensor,backward,23,1,1,1,2630,2634,4 -3810,reciprocal_4,call_function,reciprocal.default,backward,23,1,1,1,2631,2633,4 -3811,mul_286,call_function,mul.Tensor,backward,23,1,1,1,2632,2632,6 -3812,alias_default_865,call_function,alias.default,backward,23,1,1,2,2633,2631,4 -3813,mul_287,call_function,mul.Tensor,backward,23,2,2,1,3563,2627,8 -3814,sub_13,call_function,sub.Tensor,backward,23,1,1,1,2634,2629,4 -3815,mul_288,call_function,mul.Tensor,backward,23,2,2,1,2635,2628,8 -3816,add_170,call_function,add.Tensor,backward,23,1,1,1,2636,2627,4 -3817,mul_289,call_function,mul.Tensor,backward,23,2,2,1,3567,2626,8 -3818,convert_element_type_875,call_function,convert_element_type.default,backward,23,1,1,1,3568,2625,6 -3819,alias_default_866,call_function,alias.default,backward,23,1,1,2,3569,2624,4 -3820,einsum_default_259,call_function,einsum.default,backward,23,2,2,1,3570,3,5 -3821,permute_451,call_function,permute.default,backward,23,1,1,1,4,2620,3 -3822,einsum_default_260,call_function,einsum.default,backward,23,2,2,1,3571,2619,5 -3823,add_171,call_function,add.Tensor,unknown,,2,2,1,3576,2618,10 -3824,permute_452,call_function,permute.default,backward,23,1,1,1,3571,2,4 -3825,dtype_cast_295,call_function,dtype_cast.default,backward,23,1,1,1,3572,1,4 -3826,alias_default_1458,call_function,alias.default,backward,23,1,1,0,3573,0,3 -3827,convert_element_type_880,call_function,convert_element_type.default,backward,23,1,1,1,3577,2617,8 
-3828,convert_element_type_881,call_function,convert_element_type.default,backward,23,1,1,1,2606,2617,4 -3829,convert_element_type_882,call_function,convert_element_type.default,backward,23,1,1,1,3,2611,2 -3830,alias_default_867,call_function,alias.default,backward,23,1,1,2,3578,2616,4 -3831,mul_290,call_function,mul.Tensor,backward,23,2,2,1,3580,2610,8 -3832,mul_291,call_function,mul.Tensor,backward,23,2,2,1,2614,2616,8 -3833,alias_default_868,call_function,alias.default,backward,23,1,1,2,3581,2609,4 -3834,alias_default_869,call_function,alias.default,backward,23,1,1,3,2615,2615,4 -3835,mul_292,call_function,mul.Tensor,backward,23,2,2,1,3585,2608,8 -3836,sum_19,call_function,sum.dim_IntList,backward,23,1,1,1,3586,2607,5 -3837,div_37,call_function,div.Tensor,backward,23,1,1,1,2616,2607,6 -3838,mul_293,call_function,mul.Tensor,backward,23,2,2,1,3588,2606,8 -3839,sub_14,call_function,sub.Tensor,backward,23,2,2,1,3589,2605,10 -3840,mul_294,call_function,mul.Tensor,backward,23,2,2,1,3590,2604,8 -3841,mul_295,call_function,mul.Tensor,backward,23,2,2,1,3582,4,8 -3842,sum_20,call_function,sum.dim_IntList,backward,23,1,1,1,3583,3,5 -3843,convert_element_type_883,call_function,convert_element_type.default,backward,23,1,1,1,3591,2603,6 -3844,convert_element_type_884,call_function,convert_element_type.default,backward,23,1,1,1,3584,2,3 -3845,add_172,call_function,add.Tensor,unknown,,2,2,1,3592,2602,10 -3846,dtype_cast_296,call_function,dtype_cast.default,backward,23,1,1,1,3585,1,3 -3847,alias_default_1462,call_function,alias.default,backward,23,1,1,0,3586,0,2 -3848,alias_default_870,call_function,alias.default,unknown,,1,1,3,3593,2601,4 -3849,einsum_default_261,call_function,einsum.default,backward,23,2,2,1,3594,3,5 -3850,permute_455,call_function,permute.default,backward,23,1,1,1,4,2597,3 -3851,einsum_default_262,call_function,einsum.default,backward,23,2,2,1,3595,2596,5 -3852,permute_456,call_function,permute.default,backward,23,1,1,1,3595,2,4 
-3853,dtype_cast_297,call_function,dtype_cast.default,backward,23,1,1,1,3596,1,4 -3854,alias_default_1457,call_function,alias.default,backward,23,1,1,0,3597,0,3 -3855,view_744,call_function,view.default,backward,23,1,1,1,3596,2595,4 -3856,permute_457,call_function,permute.default,backward,23,1,1,1,3597,2594,4 -3857,_scaled_dot_product_flash_attention_backward_4,call_function,_scaled_dot_product_flash_attention_backward.default,backward,23,8,8,3,3601,2593,2 -3858,getitem_264,call_function,getitem,backward,23,1,1,1,3602,2566,2 -3859,getitem_265,call_function,getitem,backward,23,1,1,1,3602,2567,2 -3860,getitem_266,call_function,getitem,backward,23,1,1,1,3602,2560,2 -3861,permute_458,call_function,permute.default,backward,23,1,1,1,3603,2559,2 -3862,permute_459,call_function,permute.default,backward,23,1,1,1,3603,2566,2 -3863,permute_460,call_function,permute.default,backward,23,1,1,1,3603,2565,2 -3864,convert_element_type_889,call_function,convert_element_type.default,backward,23,1,1,1,3604,2565,2 -3865,convert_element_type_890,call_function,convert_element_type.default,backward,23,1,1,1,3604,2564,2 -3866,view_745,call_function,view.default,backward,23,1,1,1,3605,2564,2 -3867,view_as_complex_64,call_function,view_as_complex.default,backward,23,1,1,1,3606,2563,6 -3868,_conj_8,call_function,_conj.default,backward,23,1,1,1,4,2564,3 -3869,clone_38,call_function,clone.default,backward,23,1,1,1,5,2563,3 -3870,mul_296,call_function,mul.Tensor,backward,23,2,2,1,3609,2562,8 -3871,view_746,call_function,view.default,backward,23,1,1,1,3605,2563,2 -3872,view_as_complex_65,call_function,view_as_complex.default,backward,23,1,1,1,3606,2562,6 -3873,_conj_9,call_function,_conj.default,backward,23,1,1,1,4,2563,3 -3874,clone_39,call_function,clone.default,backward,23,1,1,1,5,2562,3 -3875,mul_297,call_function,mul.Tensor,backward,23,2,2,1,3609,2561,8 -3876,view_as_real_64,call_function,view_as_real.default,backward,23,1,1,1,3610,2561,6 
-3877,view_747,call_function,view.default,backward,23,1,1,1,3611,2560,6 -3878,convert_element_type_891,call_function,convert_element_type.default,backward,23,1,1,1,3612,2559,6 -3879,view_as_real_65,call_function,view_as_real.default,backward,23,1,1,1,3610,2560,6 -3880,view_748,call_function,view.default,backward,23,1,1,1,3611,2559,6 -3881,convert_element_type_892,call_function,convert_element_type.default,backward,23,1,1,1,3612,2558,6 -3882,view_749,call_function,view.default,backward,23,1,1,1,3604,2558,2 -3883,view_750,call_function,view.default,backward,23,1,1,1,3613,2558,5 -3884,view_751,call_function,view.default,backward,23,1,1,1,3613,2557,5 -3885,alias_default_871,call_function,alias.default,backward,23,1,1,2,3605,2557,4 -3886,einsum_default_263,call_function,einsum.default,backward,23,2,2,1,3606,3,5 -3887,permute_463,call_function,permute.default,backward,23,1,1,1,4,2553,3 -3888,einsum_default_264,call_function,einsum.default,backward,23,2,2,1,3607,2552,5 -3889,permute_464,call_function,permute.default,backward,23,1,1,1,3607,2,4 -3890,dtype_cast_298,call_function,dtype_cast.default,backward,23,1,1,1,3608,1,4 -3891,alias_default_1456,call_function,alias.default,backward,23,1,1,0,3609,0,3 -3892,alias_default_872,call_function,alias.default,backward,23,1,1,2,3614,2557,4 -3893,einsum_default_265,call_function,einsum.default,backward,23,2,2,1,3615,3,5 -3894,permute_467,call_function,permute.default,backward,23,1,1,1,4,2553,3 -3895,einsum_default_266,call_function,einsum.default,backward,23,2,2,1,3616,2552,5 -3896,add_173,call_function,add.Tensor,unknown,,2,2,1,3623,2551,10 -3897,permute_468,call_function,permute.default,backward,23,1,1,1,3616,2,4 -3898,dtype_cast_299,call_function,dtype_cast.default,backward,23,1,1,1,3617,1,4 -3899,alias_default_1455,call_function,alias.default,backward,23,1,1,0,3618,0,3 -3900,alias_default_873,call_function,alias.default,backward,23,1,1,2,3614,2556,4 
-3901,einsum_default_267,call_function,einsum.default,backward,23,2,2,1,3615,3,5 -3902,permute_471,call_function,permute.default,backward,23,1,1,1,4,2552,3 -3903,einsum_default_268,call_function,einsum.default,backward,23,2,2,1,3616,2551,5 -3904,add_174,call_function,add.Tensor,unknown,,2,2,1,3639,2550,10 -3905,permute_472,call_function,permute.default,backward,23,1,1,1,3616,2,4 -3906,dtype_cast_300,call_function,dtype_cast.default,backward,23,1,1,1,3617,1,4 -3907,alias_default_1454,call_function,alias.default,backward,23,1,1,0,3618,0,3 -3908,convert_element_type_905,call_function,convert_element_type.default,backward,23,1,1,1,3640,2549,8 -3909,convert_element_type_906,call_function,convert_element_type.default,backward,23,1,1,1,2539,2549,4 -3910,convert_element_type_907,call_function,convert_element_type.default,backward,23,1,1,1,3,2543,2 -3911,alias_default_874,call_function,alias.default,backward,23,1,1,2,3641,2548,4 -3912,mul_298,call_function,mul.Tensor,backward,23,2,2,1,3643,2542,8 -3913,mul_299,call_function,mul.Tensor,backward,23,2,2,1,2547,2548,8 -3914,alias_default_875,call_function,alias.default,backward,23,1,1,2,3644,2541,4 -3915,alias_default_876,call_function,alias.default,backward,23,1,1,3,2548,2547,4 -3916,mul_300,call_function,mul.Tensor,backward,23,2,2,1,3648,2540,8 -3917,sum_21,call_function,sum.dim_IntList,backward,23,1,1,1,3649,2539,5 -3918,div_38,call_function,div.Tensor,backward,23,1,1,1,2549,2539,6 -3919,mul_301,call_function,mul.Tensor,backward,23,2,2,1,3651,2538,8 -3920,sub_15,call_function,sub.Tensor,backward,23,2,2,1,3652,2537,10 -3921,mul_302,call_function,mul.Tensor,backward,23,2,2,1,3653,2536,8 -3922,mul_303,call_function,mul.Tensor,backward,23,2,2,1,3645,4,8 -3923,sum_22,call_function,sum.dim_IntList,backward,23,1,1,1,3646,3,5 -3924,convert_element_type_908,call_function,convert_element_type.default,backward,23,1,1,1,3654,2535,6 -3925,convert_element_type_909,call_function,convert_element_type.default,backward,23,1,1,1,3647,2,3 
-3926,add_175,call_function,add.Tensor,unknown,,2,2,1,3655,2534,10 -3927,dtype_cast_301,call_function,dtype_cast.default,backward,23,1,1,1,3648,1,3 -3928,alias_default_1461,call_function,alias.default,backward,23,1,1,0,3649,0,2 -3929,alias_default_877,call_function,alias.default,unknown,,1,1,3,3656,2533,4 -3930,einsum_default_269,call_function,einsum.default,backward,22,2,2,1,3657,3,5 -3931,permute_475,call_function,permute.default,backward,22,1,1,1,4,2529,3 -3932,einsum_default_270,call_function,einsum.default,backward,22,2,2,1,3658,2528,5 -3933,permute_476,call_function,permute.default,backward,22,1,1,1,3658,2,4 -3934,dtype_cast_302,call_function,dtype_cast.default,backward,22,1,1,1,3659,1,4 -3935,alias_default_1450,call_function,alias.default,backward,22,1,1,0,3660,0,3 -3936,alias_default_878,call_function,alias.default,backward,22,1,1,2,3659,2527,4 -3937,mul_304,call_function,mul.Tensor,backward,22,2,2,1,3660,2515,8 -3938,mul_305,call_function,mul.Tensor,backward,22,2,2,1,3660,2519,8 -3939,alias_default_879,call_function,alias.default,backward,22,1,1,2,3661,2514,4 -3940,einsum_default_271,call_function,einsum.default,backward,22,2,2,1,3662,3,5 -3941,permute_479,call_function,permute.default,backward,22,1,1,1,4,2510,3 -3942,einsum_default_272,call_function,einsum.default,backward,22,2,2,1,3663,2509,5 -3943,permute_480,call_function,permute.default,backward,22,1,1,1,3663,2,4 -3944,dtype_cast_303,call_function,dtype_cast.default,backward,22,1,1,1,3664,1,4 -3945,alias_default_1451,call_function,alias.default,backward,22,1,1,0,3665,0,3 -3946,convert_element_type_918,call_function,convert_element_type.default,backward,22,1,1,1,3661,2518,6 -3947,convert_element_type_919,call_function,convert_element_type.default,backward,22,1,1,1,2516,2528,4 -3948,alias_default_880,call_function,alias.default,backward,22,1,1,2,2517,2527,4 -3949,neg_33,call_function,neg.default,backward,22,1,1,1,2518,2526,8 -3950,exp_33,call_function,exp.default,backward,22,1,1,1,2519,2525,6 
-3951,add_176,call_function,add.Tensor,backward,22,1,1,1,2520,2524,4 -3952,reciprocal_5,call_function,reciprocal.default,backward,22,1,1,1,2521,2523,4 -3953,mul_306,call_function,mul.Tensor,backward,22,1,1,1,2522,2522,6 -3954,alias_default_881,call_function,alias.default,backward,22,1,1,2,2523,2521,4 -3955,mul_307,call_function,mul.Tensor,backward,22,2,2,1,3670,2517,8 -3956,sub_16,call_function,sub.Tensor,backward,22,1,1,1,2524,2519,4 -3957,mul_308,call_function,mul.Tensor,backward,22,2,2,1,2525,2518,8 -3958,add_177,call_function,add.Tensor,backward,22,1,1,1,2526,2517,4 -3959,mul_309,call_function,mul.Tensor,backward,22,2,2,1,3674,2516,8 -3960,convert_element_type_920,call_function,convert_element_type.default,backward,22,1,1,1,3675,2515,6 -3961,alias_default_882,call_function,alias.default,backward,22,1,1,2,3676,2514,4 -3962,einsum_default_273,call_function,einsum.default,backward,22,2,2,1,3677,3,5 -3963,permute_483,call_function,permute.default,backward,22,1,1,1,4,2510,3 -3964,einsum_default_274,call_function,einsum.default,backward,22,2,2,1,3678,2509,5 -3965,add_178,call_function,add.Tensor,unknown,,2,2,1,3683,2508,10 -3966,permute_484,call_function,permute.default,backward,22,1,1,1,3678,2,4 -3967,dtype_cast_304,call_function,dtype_cast.default,backward,22,1,1,1,3679,1,4 -3968,alias_default_1449,call_function,alias.default,backward,22,1,1,0,3680,0,3 -3969,convert_element_type_925,call_function,convert_element_type.default,backward,22,1,1,1,3684,2507,8 -3970,convert_element_type_926,call_function,convert_element_type.default,backward,22,1,1,1,2496,2507,4 -3971,convert_element_type_927,call_function,convert_element_type.default,backward,22,1,1,1,3,2501,2 -3972,alias_default_883,call_function,alias.default,backward,22,1,1,2,3685,2506,4 -3973,mul_310,call_function,mul.Tensor,backward,22,2,2,1,3687,2500,8 -3974,mul_311,call_function,mul.Tensor,backward,22,2,2,1,2504,2506,8 -3975,alias_default_884,call_function,alias.default,backward,22,1,1,2,3688,2499,4 
-3976,alias_default_885,call_function,alias.default,backward,22,1,1,3,2505,2505,4 -3977,mul_312,call_function,mul.Tensor,backward,22,2,2,1,3692,2498,8 -3978,sum_23,call_function,sum.dim_IntList,backward,22,1,1,1,3693,2497,5 -3979,div_39,call_function,div.Tensor,backward,22,1,1,1,2506,2497,6 -3980,mul_313,call_function,mul.Tensor,backward,22,2,2,1,3695,2496,8 -3981,sub_17,call_function,sub.Tensor,backward,22,2,2,1,3696,2495,10 -3982,mul_314,call_function,mul.Tensor,backward,22,2,2,1,3697,2494,8 -3983,mul_315,call_function,mul.Tensor,backward,22,2,2,1,3689,4,8 -3984,sum_24,call_function,sum.dim_IntList,backward,22,1,1,1,3690,3,5 -3985,convert_element_type_928,call_function,convert_element_type.default,backward,22,1,1,1,3698,2493,6 -3986,convert_element_type_929,call_function,convert_element_type.default,backward,22,1,1,1,3691,2,3 -3987,add_179,call_function,add.Tensor,unknown,,2,2,1,3699,2492,10 -3988,dtype_cast_305,call_function,dtype_cast.default,backward,22,1,1,1,3692,1,3 -3989,alias_default_1453,call_function,alias.default,backward,22,1,1,0,3693,0,2 -3990,alias_default_886,call_function,alias.default,unknown,,1,1,3,3700,2491,4 -3991,einsum_default_275,call_function,einsum.default,backward,22,2,2,1,3701,3,5 -3992,permute_487,call_function,permute.default,backward,22,1,1,1,4,2487,3 -3993,einsum_default_276,call_function,einsum.default,backward,22,2,2,1,3702,2486,5 -3994,permute_488,call_function,permute.default,backward,22,1,1,1,3702,2,4 -3995,dtype_cast_306,call_function,dtype_cast.default,backward,22,1,1,1,3703,1,4 -3996,alias_default_1448,call_function,alias.default,backward,22,1,1,0,3704,0,3 -3997,view_766,call_function,view.default,backward,22,1,1,1,3703,2485,4 -3998,permute_489,call_function,permute.default,backward,22,1,1,1,3704,2484,4 -3999,_scaled_dot_product_flash_attention_backward_5,call_function,_scaled_dot_product_flash_attention_backward.default,backward,22,8,8,3,3708,2483,2 -4000,getitem_267,call_function,getitem,backward,22,1,1,1,3709,2456,2 
-4001,getitem_268,call_function,getitem,backward,22,1,1,1,3709,2457,2 -4002,getitem_269,call_function,getitem,backward,22,1,1,1,3709,2450,2 -4003,permute_490,call_function,permute.default,backward,22,1,1,1,3710,2449,2 -4004,permute_491,call_function,permute.default,backward,22,1,1,1,3710,2456,2 -4005,permute_492,call_function,permute.default,backward,22,1,1,1,3710,2455,2 -4006,convert_element_type_934,call_function,convert_element_type.default,backward,22,1,1,1,3711,2455,2 -4007,convert_element_type_935,call_function,convert_element_type.default,backward,22,1,1,1,3711,2454,2 -4008,view_767,call_function,view.default,backward,22,1,1,1,3712,2454,2 -4009,view_as_complex_66,call_function,view_as_complex.default,backward,22,1,1,1,3713,2453,6 -4010,_conj_10,call_function,_conj.default,backward,22,1,1,1,4,2454,3 -4011,clone_46,call_function,clone.default,backward,22,1,1,1,5,2453,3 -4012,mul_316,call_function,mul.Tensor,backward,22,2,2,1,3716,2452,8 -4013,view_768,call_function,view.default,backward,22,1,1,1,3712,2453,2 -4014,view_as_complex_67,call_function,view_as_complex.default,backward,22,1,1,1,3713,2452,6 -4015,_conj_11,call_function,_conj.default,backward,22,1,1,1,4,2453,3 -4016,clone_47,call_function,clone.default,backward,22,1,1,1,5,2452,3 -4017,mul_317,call_function,mul.Tensor,backward,22,2,2,1,3716,2451,8 -4018,view_as_real_66,call_function,view_as_real.default,backward,22,1,1,1,3717,2451,6 -4019,view_769,call_function,view.default,backward,22,1,1,1,3718,2450,6 -4020,convert_element_type_936,call_function,convert_element_type.default,backward,22,1,1,1,3719,2449,6 -4021,view_as_real_67,call_function,view_as_real.default,backward,22,1,1,1,3717,2450,6 -4022,view_770,call_function,view.default,backward,22,1,1,1,3718,2449,6 -4023,convert_element_type_937,call_function,convert_element_type.default,backward,22,1,1,1,3719,2448,6 -4024,view_771,call_function,view.default,backward,22,1,1,1,3711,2448,2 -4025,view_772,call_function,view.default,backward,22,1,1,1,3720,2448,5 
-4026,view_773,call_function,view.default,backward,22,1,1,1,3720,2447,5 -4027,alias_default_887,call_function,alias.default,backward,22,1,1,2,3712,2447,4 -4028,einsum_default_277,call_function,einsum.default,backward,22,2,2,1,3713,3,5 -4029,permute_495,call_function,permute.default,backward,22,1,1,1,4,2443,3 -4030,einsum_default_278,call_function,einsum.default,backward,22,2,2,1,3714,2442,5 -4031,permute_496,call_function,permute.default,backward,22,1,1,1,3714,2,4 -4032,dtype_cast_307,call_function,dtype_cast.default,backward,22,1,1,1,3715,1,4 -4033,alias_default_1447,call_function,alias.default,backward,22,1,1,0,3716,0,3 -4034,alias_default_888,call_function,alias.default,backward,22,1,1,2,3721,2447,4 -4035,einsum_default_279,call_function,einsum.default,backward,22,2,2,1,3722,3,5 -4036,permute_499,call_function,permute.default,backward,22,1,1,1,4,2443,3 -4037,einsum_default_280,call_function,einsum.default,backward,22,2,2,1,3723,2442,5 -4038,add_180,call_function,add.Tensor,unknown,,2,2,1,3730,2441,10 -4039,permute_500,call_function,permute.default,backward,22,1,1,1,3723,2,4 -4040,dtype_cast_308,call_function,dtype_cast.default,backward,22,1,1,1,3724,1,4 -4041,alias_default_1446,call_function,alias.default,backward,22,1,1,0,3725,0,3 -4042,alias_default_889,call_function,alias.default,backward,22,1,1,2,3721,2446,4 -4043,einsum_default_281,call_function,einsum.default,backward,22,2,2,1,3722,3,5 -4044,permute_503,call_function,permute.default,backward,22,1,1,1,4,2442,3 -4045,einsum_default_282,call_function,einsum.default,backward,22,2,2,1,3723,2441,5 -4046,add_181,call_function,add.Tensor,unknown,,2,2,1,3746,2440,10 -4047,permute_504,call_function,permute.default,backward,22,1,1,1,3723,2,4 -4048,dtype_cast_309,call_function,dtype_cast.default,backward,22,1,1,1,3724,1,4 -4049,alias_default_1445,call_function,alias.default,backward,22,1,1,0,3725,0,3 -4050,convert_element_type_950,call_function,convert_element_type.default,backward,22,1,1,1,3747,2439,8 
-4051,convert_element_type_951,call_function,convert_element_type.default,backward,22,1,1,1,2429,2439,4 -4052,convert_element_type_952,call_function,convert_element_type.default,backward,22,1,1,1,3,2433,2 -4053,alias_default_890,call_function,alias.default,backward,22,1,1,2,3748,2438,4 -4054,mul_318,call_function,mul.Tensor,backward,22,2,2,1,3750,2432,8 -4055,mul_319,call_function,mul.Tensor,backward,22,2,2,1,2437,2438,8 -4056,alias_default_891,call_function,alias.default,backward,22,1,1,2,3751,2431,4 -4057,alias_default_892,call_function,alias.default,backward,22,1,1,3,2438,2437,4 -4058,mul_320,call_function,mul.Tensor,backward,22,2,2,1,3755,2430,8 -4059,sum_25,call_function,sum.dim_IntList,backward,22,1,1,1,3756,2429,5 -4060,div_40,call_function,div.Tensor,backward,22,1,1,1,2439,2429,6 -4061,mul_321,call_function,mul.Tensor,backward,22,2,2,1,3758,2428,8 -4062,sub_18,call_function,sub.Tensor,backward,22,2,2,1,3759,2427,10 -4063,mul_322,call_function,mul.Tensor,backward,22,2,2,1,3760,2426,8 -4064,mul_323,call_function,mul.Tensor,backward,22,2,2,1,3752,4,8 -4065,sum_26,call_function,sum.dim_IntList,backward,22,1,1,1,3753,3,5 -4066,convert_element_type_953,call_function,convert_element_type.default,backward,22,1,1,1,3761,2425,6 -4067,convert_element_type_954,call_function,convert_element_type.default,backward,22,1,1,1,3754,2,3 -4068,add_182,call_function,add.Tensor,unknown,,2,2,1,3762,2424,10 -4069,dtype_cast_310,call_function,dtype_cast.default,backward,22,1,1,1,3755,1,3 -4070,alias_default_1452,call_function,alias.default,backward,22,1,1,0,3756,0,2 -4071,alias_default_893,call_function,alias.default,unknown,,1,1,3,3763,2423,4 -4072,einsum_default_283,call_function,einsum.default,backward,21,2,2,1,3764,3,5 -4073,permute_507,call_function,permute.default,backward,21,1,1,1,4,2419,3 -4074,einsum_default_284,call_function,einsum.default,backward,21,2,2,1,3765,2418,5 -4075,permute_508,call_function,permute.default,backward,21,1,1,1,3765,2,4 
-4076,dtype_cast_311,call_function,dtype_cast.default,backward,21,1,1,1,3766,1,4 -4077,alias_default_1441,call_function,alias.default,backward,21,1,1,0,3767,0,3 -4078,alias_default_894,call_function,alias.default,backward,21,1,1,2,3766,2417,4 -4079,mul_324,call_function,mul.Tensor,backward,21,2,2,1,3767,2405,8 -4080,mul_325,call_function,mul.Tensor,backward,21,2,2,1,3767,2409,8 -4081,alias_default_895,call_function,alias.default,backward,21,1,1,2,3768,2404,4 -4082,einsum_default_285,call_function,einsum.default,backward,21,2,2,1,3769,3,5 -4083,permute_511,call_function,permute.default,backward,21,1,1,1,4,2400,3 -4084,einsum_default_286,call_function,einsum.default,backward,21,2,2,1,3770,2399,5 -4085,permute_512,call_function,permute.default,backward,21,1,1,1,3770,2,4 -4086,dtype_cast_312,call_function,dtype_cast.default,backward,21,1,1,1,3771,1,4 -4087,alias_default_1442,call_function,alias.default,backward,21,1,1,0,3772,0,3 -4088,convert_element_type_963,call_function,convert_element_type.default,backward,21,1,1,1,3768,2408,6 -4089,convert_element_type_964,call_function,convert_element_type.default,backward,21,1,1,1,2406,2418,4 -4090,alias_default_896,call_function,alias.default,backward,21,1,1,2,2407,2417,4 -4091,neg_34,call_function,neg.default,backward,21,1,1,1,2408,2416,8 -4092,exp_34,call_function,exp.default,backward,21,1,1,1,2409,2415,6 -4093,add_183,call_function,add.Tensor,backward,21,1,1,1,2410,2414,4 -4094,reciprocal_6,call_function,reciprocal.default,backward,21,1,1,1,2411,2413,4 -4095,mul_326,call_function,mul.Tensor,backward,21,1,1,1,2412,2412,6 -4096,alias_default_897,call_function,alias.default,backward,21,1,1,2,2413,2411,4 -4097,mul_327,call_function,mul.Tensor,backward,21,2,2,1,3777,2407,8 -4098,sub_19,call_function,sub.Tensor,backward,21,1,1,1,2414,2409,4 -4099,mul_328,call_function,mul.Tensor,backward,21,2,2,1,2415,2408,8 -4100,add_184,call_function,add.Tensor,backward,21,1,1,1,2416,2407,4 
-4101,mul_329,call_function,mul.Tensor,backward,21,2,2,1,3781,2406,8 -4102,convert_element_type_965,call_function,convert_element_type.default,backward,21,1,1,1,3782,2405,6 -4103,alias_default_898,call_function,alias.default,backward,21,1,1,2,3783,2404,4 -4104,einsum_default_287,call_function,einsum.default,backward,21,2,2,1,3784,3,5 -4105,permute_515,call_function,permute.default,backward,21,1,1,1,4,2400,3 -4106,einsum_default_288,call_function,einsum.default,backward,21,2,2,1,3785,2399,5 -4107,add_185,call_function,add.Tensor,unknown,,2,2,1,3790,2398,10 -4108,permute_516,call_function,permute.default,backward,21,1,1,1,3785,2,4 -4109,dtype_cast_313,call_function,dtype_cast.default,backward,21,1,1,1,3786,1,4 -4110,alias_default_1440,call_function,alias.default,backward,21,1,1,0,3787,0,3 -4111,convert_element_type_970,call_function,convert_element_type.default,backward,21,1,1,1,3791,2397,8 -4112,convert_element_type_971,call_function,convert_element_type.default,backward,21,1,1,1,2386,2397,4 -4113,convert_element_type_972,call_function,convert_element_type.default,backward,21,1,1,1,3,2391,2 -4114,alias_default_899,call_function,alias.default,backward,21,1,1,2,3792,2396,4 -4115,mul_330,call_function,mul.Tensor,backward,21,2,2,1,3794,2390,8 -4116,mul_331,call_function,mul.Tensor,backward,21,2,2,1,2394,2396,8 -4117,alias_default_900,call_function,alias.default,backward,21,1,1,2,3795,2389,4 -4118,alias_default_901,call_function,alias.default,backward,21,1,1,3,2395,2395,4 -4119,mul_332,call_function,mul.Tensor,backward,21,2,2,1,3799,2388,8 -4120,sum_27,call_function,sum.dim_IntList,backward,21,1,1,1,3800,2387,5 -4121,div_41,call_function,div.Tensor,backward,21,1,1,1,2396,2387,6 -4122,mul_333,call_function,mul.Tensor,backward,21,2,2,1,3802,2386,8 -4123,sub_20,call_function,sub.Tensor,backward,21,2,2,1,3803,2385,10 -4124,mul_334,call_function,mul.Tensor,backward,21,2,2,1,3804,2384,8 -4125,mul_335,call_function,mul.Tensor,backward,21,2,2,1,3796,4,8 
-4126,sum_28,call_function,sum.dim_IntList,backward,21,1,1,1,3797,3,5 -4127,convert_element_type_973,call_function,convert_element_type.default,backward,21,1,1,1,3805,2383,6 -4128,convert_element_type_974,call_function,convert_element_type.default,backward,21,1,1,1,3798,2,3 -4129,add_186,call_function,add.Tensor,unknown,,2,2,1,3806,2382,10 -4130,dtype_cast_314,call_function,dtype_cast.default,backward,21,1,1,1,3799,1,3 -4131,alias_default_1444,call_function,alias.default,backward,21,1,1,0,3800,0,2 -4132,alias_default_902,call_function,alias.default,unknown,,1,1,3,3807,2381,4 -4133,einsum_default_289,call_function,einsum.default,backward,21,2,2,1,3808,3,5 -4134,permute_519,call_function,permute.default,backward,21,1,1,1,4,2377,3 -4135,einsum_default_290,call_function,einsum.default,backward,21,2,2,1,3809,2376,5 -4136,permute_520,call_function,permute.default,backward,21,1,1,1,3809,2,4 -4137,dtype_cast_315,call_function,dtype_cast.default,backward,21,1,1,1,3810,1,4 -4138,alias_default_1439,call_function,alias.default,backward,21,1,1,0,3811,0,3 -4139,view_788,call_function,view.default,backward,21,1,1,1,3810,2375,4 -4140,permute_521,call_function,permute.default,backward,21,1,1,1,3811,2374,4 -4141,_scaled_dot_product_flash_attention_backward_6,call_function,_scaled_dot_product_flash_attention_backward.default,backward,21,8,8,3,3815,2373,2 -4142,getitem_270,call_function,getitem,backward,21,1,1,1,3816,2346,2 -4143,getitem_271,call_function,getitem,backward,21,1,1,1,3816,2347,2 -4144,getitem_272,call_function,getitem,backward,21,1,1,1,3816,2340,2 -4145,permute_522,call_function,permute.default,backward,21,1,1,1,3817,2339,2 -4146,permute_523,call_function,permute.default,backward,21,1,1,1,3817,2346,2 -4147,permute_524,call_function,permute.default,backward,21,1,1,1,3817,2345,2 -4148,convert_element_type_979,call_function,convert_element_type.default,backward,21,1,1,1,3818,2345,2 
-4149,convert_element_type_980,call_function,convert_element_type.default,backward,21,1,1,1,3818,2344,2 -4150,view_789,call_function,view.default,backward,21,1,1,1,3819,2344,2 -4151,view_as_complex_68,call_function,view_as_complex.default,backward,21,1,1,1,3820,2343,6 -4152,_conj_12,call_function,_conj.default,backward,21,1,1,1,4,2344,3 -4153,clone_54,call_function,clone.default,backward,21,1,1,1,5,2343,3 -4154,mul_336,call_function,mul.Tensor,backward,21,2,2,1,3823,2342,8 -4155,view_790,call_function,view.default,backward,21,1,1,1,3819,2343,2 -4156,view_as_complex_69,call_function,view_as_complex.default,backward,21,1,1,1,3820,2342,6 -4157,_conj_13,call_function,_conj.default,backward,21,1,1,1,4,2343,3 -4158,clone_55,call_function,clone.default,backward,21,1,1,1,5,2342,3 -4159,mul_337,call_function,mul.Tensor,backward,21,2,2,1,3823,2341,8 -4160,view_as_real_68,call_function,view_as_real.default,backward,21,1,1,1,3824,2341,6 -4161,view_791,call_function,view.default,backward,21,1,1,1,3825,2340,6 -4162,convert_element_type_981,call_function,convert_element_type.default,backward,21,1,1,1,3826,2339,6 -4163,view_as_real_69,call_function,view_as_real.default,backward,21,1,1,1,3824,2340,6 -4164,view_792,call_function,view.default,backward,21,1,1,1,3825,2339,6 -4165,convert_element_type_982,call_function,convert_element_type.default,backward,21,1,1,1,3826,2338,6 -4166,view_793,call_function,view.default,backward,21,1,1,1,3818,2338,2 -4167,view_794,call_function,view.default,backward,21,1,1,1,3827,2338,5 -4168,view_795,call_function,view.default,backward,21,1,1,1,3827,2337,5 -4169,alias_default_903,call_function,alias.default,backward,21,1,1,2,3819,2337,4 -4170,einsum_default_291,call_function,einsum.default,backward,21,2,2,1,3820,3,5 -4171,permute_527,call_function,permute.default,backward,21,1,1,1,4,2333,3 -4172,einsum_default_292,call_function,einsum.default,backward,21,2,2,1,3821,2332,5 -4173,permute_528,call_function,permute.default,backward,21,1,1,1,3821,2,4 
-4174,dtype_cast_316,call_function,dtype_cast.default,backward,21,1,1,1,3822,1,4 -4175,alias_default_1438,call_function,alias.default,backward,21,1,1,0,3823,0,3 -4176,alias_default_904,call_function,alias.default,backward,21,1,1,2,3828,2337,4 -4177,einsum_default_293,call_function,einsum.default,backward,21,2,2,1,3829,3,5 -4178,permute_531,call_function,permute.default,backward,21,1,1,1,4,2333,3 -4179,einsum_default_294,call_function,einsum.default,backward,21,2,2,1,3830,2332,5 -4180,add_187,call_function,add.Tensor,unknown,,2,2,1,3837,2331,10 -4181,permute_532,call_function,permute.default,backward,21,1,1,1,3830,2,4 -4182,dtype_cast_317,call_function,dtype_cast.default,backward,21,1,1,1,3831,1,4 -4183,alias_default_1437,call_function,alias.default,backward,21,1,1,0,3832,0,3 -4184,alias_default_905,call_function,alias.default,backward,21,1,1,2,3828,2336,4 -4185,einsum_default_295,call_function,einsum.default,backward,21,2,2,1,3829,3,5 -4186,permute_535,call_function,permute.default,backward,21,1,1,1,4,2332,3 -4187,einsum_default_296,call_function,einsum.default,backward,21,2,2,1,3830,2331,5 -4188,add_188,call_function,add.Tensor,unknown,,2,2,1,3853,2330,10 -4189,permute_536,call_function,permute.default,backward,21,1,1,1,3830,2,4 -4190,dtype_cast_318,call_function,dtype_cast.default,backward,21,1,1,1,3831,1,4 -4191,alias_default_1436,call_function,alias.default,backward,21,1,1,0,3832,0,3 -4192,convert_element_type_995,call_function,convert_element_type.default,backward,21,1,1,1,3854,2329,8 -4193,convert_element_type_996,call_function,convert_element_type.default,backward,21,1,1,1,2319,2329,4 -4194,convert_element_type_997,call_function,convert_element_type.default,backward,21,1,1,1,3,2323,2 -4195,alias_default_906,call_function,alias.default,backward,21,1,1,2,3855,2328,4 -4196,mul_338,call_function,mul.Tensor,backward,21,2,2,1,3857,2322,8 -4197,mul_339,call_function,mul.Tensor,backward,21,2,2,1,2327,2328,8 
-4198,alias_default_907,call_function,alias.default,backward,21,1,1,2,3858,2321,4 -4199,alias_default_908,call_function,alias.default,backward,21,1,1,3,2328,2327,4 -4200,mul_340,call_function,mul.Tensor,backward,21,2,2,1,3862,2320,8 -4201,sum_29,call_function,sum.dim_IntList,backward,21,1,1,1,3863,2319,5 -4202,div_42,call_function,div.Tensor,backward,21,1,1,1,2329,2319,6 -4203,mul_341,call_function,mul.Tensor,backward,21,2,2,1,3865,2318,8 -4204,sub_21,call_function,sub.Tensor,backward,21,2,2,1,3866,2317,10 -4205,mul_342,call_function,mul.Tensor,backward,21,2,2,1,3867,2316,8 -4206,mul_343,call_function,mul.Tensor,backward,21,2,2,1,3859,4,8 -4207,sum_30,call_function,sum.dim_IntList,backward,21,1,1,1,3860,3,5 -4208,convert_element_type_998,call_function,convert_element_type.default,backward,21,1,1,1,3868,2315,6 -4209,convert_element_type_999,call_function,convert_element_type.default,backward,21,1,1,1,3861,2,3 -4210,add_189,call_function,add.Tensor,unknown,,2,2,1,3869,2314,10 -4211,dtype_cast_319,call_function,dtype_cast.default,backward,21,1,1,1,3862,1,3 -4212,alias_default_1443,call_function,alias.default,backward,21,1,1,0,3863,0,2 -4213,alias_default_909,call_function,alias.default,unknown,,1,1,3,3870,2313,4 -4214,einsum_default_297,call_function,einsum.default,backward,20,2,2,1,3871,3,5 -4215,permute_539,call_function,permute.default,backward,20,1,1,1,4,2309,3 -4216,einsum_default_298,call_function,einsum.default,backward,20,2,2,1,3872,2308,5 -4217,permute_540,call_function,permute.default,backward,20,1,1,1,3872,2,4 -4218,dtype_cast_320,call_function,dtype_cast.default,backward,20,1,1,1,3873,1,4 -4219,alias_default_1432,call_function,alias.default,backward,20,1,1,0,3874,0,3 -4220,alias_default_910,call_function,alias.default,backward,20,1,1,2,3873,2307,4 -4221,mul_344,call_function,mul.Tensor,backward,20,2,2,1,3874,2295,8 -4222,mul_345,call_function,mul.Tensor,backward,20,2,2,1,3874,2299,8 
-4223,alias_default_911,call_function,alias.default,backward,20,1,1,2,3875,2294,4 -4224,einsum_default_299,call_function,einsum.default,backward,20,2,2,1,3876,3,5 -4225,permute_543,call_function,permute.default,backward,20,1,1,1,4,2290,3 -4226,einsum_default_300,call_function,einsum.default,backward,20,2,2,1,3877,2289,5 -4227,permute_544,call_function,permute.default,backward,20,1,1,1,3877,2,4 -4228,dtype_cast_321,call_function,dtype_cast.default,backward,20,1,1,1,3878,1,4 -4229,alias_default_1433,call_function,alias.default,backward,20,1,1,0,3879,0,3 -4230,convert_element_type_1008,call_function,convert_element_type.default,backward,20,1,1,1,3875,2298,6 -4231,convert_element_type_1009,call_function,convert_element_type.default,backward,20,1,1,1,2296,2308,4 -4232,alias_default_912,call_function,alias.default,backward,20,1,1,2,2297,2307,4 -4233,neg_35,call_function,neg.default,backward,20,1,1,1,2298,2306,8 -4234,exp_35,call_function,exp.default,backward,20,1,1,1,2299,2305,6 -4235,add_190,call_function,add.Tensor,backward,20,1,1,1,2300,2304,4 -4236,reciprocal_7,call_function,reciprocal.default,backward,20,1,1,1,2301,2303,4 -4237,mul_346,call_function,mul.Tensor,backward,20,1,1,1,2302,2302,6 -4238,alias_default_913,call_function,alias.default,backward,20,1,1,2,2303,2301,4 -4239,mul_347,call_function,mul.Tensor,backward,20,2,2,1,3884,2297,8 -4240,sub_22,call_function,sub.Tensor,backward,20,1,1,1,2304,2299,4 -4241,mul_348,call_function,mul.Tensor,backward,20,2,2,1,2305,2298,8 -4242,add_191,call_function,add.Tensor,backward,20,1,1,1,2306,2297,4 -4243,mul_349,call_function,mul.Tensor,backward,20,2,2,1,3888,2296,8 -4244,convert_element_type_1010,call_function,convert_element_type.default,backward,20,1,1,1,3889,2295,6 -4245,alias_default_914,call_function,alias.default,backward,20,1,1,2,3890,2294,4 -4246,einsum_default_301,call_function,einsum.default,backward,20,2,2,1,3891,3,5 -4247,permute_547,call_function,permute.default,backward,20,1,1,1,4,2290,3 
-4248,einsum_default_302,call_function,einsum.default,backward,20,2,2,1,3892,2289,5 -4249,add_192,call_function,add.Tensor,unknown,,2,2,1,3897,2288,10 -4250,permute_548,call_function,permute.default,backward,20,1,1,1,3892,2,4 -4251,dtype_cast_322,call_function,dtype_cast.default,backward,20,1,1,1,3893,1,4 -4252,alias_default_1431,call_function,alias.default,backward,20,1,1,0,3894,0,3 -4253,convert_element_type_1015,call_function,convert_element_type.default,backward,20,1,1,1,3898,2287,8 -4254,convert_element_type_1016,call_function,convert_element_type.default,backward,20,1,1,1,2276,2287,4 -4255,convert_element_type_1017,call_function,convert_element_type.default,backward,20,1,1,1,3,2281,2 -4256,alias_default_915,call_function,alias.default,backward,20,1,1,2,3899,2286,4 -4257,mul_350,call_function,mul.Tensor,backward,20,2,2,1,3901,2280,8 -4258,mul_351,call_function,mul.Tensor,backward,20,2,2,1,2284,2286,8 -4259,alias_default_916,call_function,alias.default,backward,20,1,1,2,3902,2279,4 -4260,alias_default_917,call_function,alias.default,backward,20,1,1,3,2285,2285,4 -4261,mul_352,call_function,mul.Tensor,backward,20,2,2,1,3906,2278,8 -4262,sum_31,call_function,sum.dim_IntList,backward,20,1,1,1,3907,2277,5 -4263,div_43,call_function,div.Tensor,backward,20,1,1,1,2286,2277,6 -4264,mul_353,call_function,mul.Tensor,backward,20,2,2,1,3909,2276,8 -4265,sub_23,call_function,sub.Tensor,backward,20,2,2,1,3910,2275,10 -4266,mul_354,call_function,mul.Tensor,backward,20,2,2,1,3911,2274,8 -4267,mul_355,call_function,mul.Tensor,backward,20,2,2,1,3903,4,8 -4268,sum_32,call_function,sum.dim_IntList,backward,20,1,1,1,3904,3,5 -4269,convert_element_type_1018,call_function,convert_element_type.default,backward,20,1,1,1,3912,2273,6 -4270,convert_element_type_1019,call_function,convert_element_type.default,backward,20,1,1,1,3905,2,3 -4271,add_193,call_function,add.Tensor,unknown,,2,2,1,3913,2272,10 -4272,dtype_cast_323,call_function,dtype_cast.default,backward,20,1,1,1,3906,1,3 
-4273,alias_default_1435,call_function,alias.default,backward,20,1,1,0,3907,0,2 -4274,alias_default_918,call_function,alias.default,unknown,,1,1,3,3914,2271,4 -4275,einsum_default_303,call_function,einsum.default,backward,20,2,2,1,3915,3,5 -4276,permute_551,call_function,permute.default,backward,20,1,1,1,4,2267,3 -4277,einsum_default_304,call_function,einsum.default,backward,20,2,2,1,3916,2266,5 -4278,permute_552,call_function,permute.default,backward,20,1,1,1,3916,2,4 -4279,dtype_cast_324,call_function,dtype_cast.default,backward,20,1,1,1,3917,1,4 -4280,alias_default_1430,call_function,alias.default,backward,20,1,1,0,3918,0,3 -4281,view_810,call_function,view.default,backward,20,1,1,1,3917,2265,4 -4282,permute_553,call_function,permute.default,backward,20,1,1,1,3918,2264,4 -4283,_scaled_dot_product_flash_attention_backward_7,call_function,_scaled_dot_product_flash_attention_backward.default,backward,20,8,8,3,3922,2263,2 -4284,getitem_273,call_function,getitem,backward,20,1,1,1,3923,2236,2 -4285,getitem_274,call_function,getitem,backward,20,1,1,1,3923,2237,2 -4286,getitem_275,call_function,getitem,backward,20,1,1,1,3923,2230,2 -4287,permute_554,call_function,permute.default,backward,20,1,1,1,3924,2229,2 -4288,permute_555,call_function,permute.default,backward,20,1,1,1,3924,2236,2 -4289,permute_556,call_function,permute.default,backward,20,1,1,1,3924,2235,2 -4290,convert_element_type_1024,call_function,convert_element_type.default,backward,20,1,1,1,3925,2235,2 -4291,convert_element_type_1025,call_function,convert_element_type.default,backward,20,1,1,1,3925,2234,2 -4292,view_811,call_function,view.default,backward,20,1,1,1,3926,2234,2 -4293,view_as_complex_70,call_function,view_as_complex.default,backward,20,1,1,1,3927,2233,6 -4294,_conj_14,call_function,_conj.default,backward,20,1,1,1,4,2234,3 -4295,clone_62,call_function,clone.default,backward,20,1,1,1,5,2233,3 -4296,mul_356,call_function,mul.Tensor,backward,20,2,2,1,3930,2232,8 
-4297,view_812,call_function,view.default,backward,20,1,1,1,3926,2233,2 -4298,view_as_complex_71,call_function,view_as_complex.default,backward,20,1,1,1,3927,2232,6 -4299,_conj_15,call_function,_conj.default,backward,20,1,1,1,4,2233,3 -4300,clone_63,call_function,clone.default,backward,20,1,1,1,5,2232,3 -4301,mul_357,call_function,mul.Tensor,backward,20,2,2,1,3930,2231,8 -4302,view_as_real_70,call_function,view_as_real.default,backward,20,1,1,1,3931,2231,6 -4303,view_813,call_function,view.default,backward,20,1,1,1,3932,2230,6 -4304,convert_element_type_1026,call_function,convert_element_type.default,backward,20,1,1,1,3933,2229,6 -4305,view_as_real_71,call_function,view_as_real.default,backward,20,1,1,1,3931,2230,6 -4306,view_814,call_function,view.default,backward,20,1,1,1,3932,2229,6 -4307,convert_element_type_1027,call_function,convert_element_type.default,backward,20,1,1,1,3933,2228,6 -4308,view_815,call_function,view.default,backward,20,1,1,1,3925,2228,2 -4309,view_816,call_function,view.default,backward,20,1,1,1,3934,2228,5 -4310,view_817,call_function,view.default,backward,20,1,1,1,3934,2227,5 -4311,alias_default_919,call_function,alias.default,backward,20,1,1,2,3926,2227,4 -4312,einsum_default_305,call_function,einsum.default,backward,20,2,2,1,3927,3,5 -4313,permute_559,call_function,permute.default,backward,20,1,1,1,4,2223,3 -4314,einsum_default_306,call_function,einsum.default,backward,20,2,2,1,3928,2222,5 -4315,permute_560,call_function,permute.default,backward,20,1,1,1,3928,2,4 -4316,dtype_cast_325,call_function,dtype_cast.default,backward,20,1,1,1,3929,1,4 -4317,alias_default_1429,call_function,alias.default,backward,20,1,1,0,3930,0,3 -4318,alias_default_920,call_function,alias.default,backward,20,1,1,2,3935,2227,4 -4319,einsum_default_307,call_function,einsum.default,backward,20,2,2,1,3936,3,5 -4320,permute_563,call_function,permute.default,backward,20,1,1,1,4,2223,3 -4321,einsum_default_308,call_function,einsum.default,backward,20,2,2,1,3937,2222,5 
-4322,add_194,call_function,add.Tensor,unknown,,2,2,1,3944,2221,10 -4323,permute_564,call_function,permute.default,backward,20,1,1,1,3937,2,4 -4324,dtype_cast_326,call_function,dtype_cast.default,backward,20,1,1,1,3938,1,4 -4325,alias_default_1428,call_function,alias.default,backward,20,1,1,0,3939,0,3 -4326,alias_default_921,call_function,alias.default,backward,20,1,1,2,3935,2226,4 -4327,einsum_default_309,call_function,einsum.default,backward,20,2,2,1,3936,3,5 -4328,permute_567,call_function,permute.default,backward,20,1,1,1,4,2222,3 -4329,einsum_default_310,call_function,einsum.default,backward,20,2,2,1,3937,2221,5 -4330,add_195,call_function,add.Tensor,unknown,,2,2,1,3960,2220,10 -4331,permute_568,call_function,permute.default,backward,20,1,1,1,3937,2,4 -4332,dtype_cast_327,call_function,dtype_cast.default,backward,20,1,1,1,3938,1,4 -4333,alias_default_1427,call_function,alias.default,backward,20,1,1,0,3939,0,3 -4334,convert_element_type_1040,call_function,convert_element_type.default,backward,20,1,1,1,3961,2219,8 -4335,convert_element_type_1041,call_function,convert_element_type.default,backward,20,1,1,1,2209,2219,4 -4336,convert_element_type_1042,call_function,convert_element_type.default,backward,20,1,1,1,3,2213,2 -4337,alias_default_922,call_function,alias.default,backward,20,1,1,2,3962,2218,4 -4338,mul_358,call_function,mul.Tensor,backward,20,2,2,1,3964,2212,8 -4339,mul_359,call_function,mul.Tensor,backward,20,2,2,1,2217,2218,8 -4340,alias_default_923,call_function,alias.default,backward,20,1,1,2,3965,2211,4 -4341,alias_default_924,call_function,alias.default,backward,20,1,1,3,2218,2217,4 -4342,mul_360,call_function,mul.Tensor,backward,20,2,2,1,3969,2210,8 -4343,sum_33,call_function,sum.dim_IntList,backward,20,1,1,1,3970,2209,5 -4344,div_44,call_function,div.Tensor,backward,20,1,1,1,2219,2209,6 -4345,mul_361,call_function,mul.Tensor,backward,20,2,2,1,3972,2208,8 -4346,sub_24,call_function,sub.Tensor,backward,20,2,2,1,3973,2207,10 
-4347,mul_362,call_function,mul.Tensor,backward,20,2,2,1,3974,2206,8 -4348,mul_363,call_function,mul.Tensor,backward,20,2,2,1,3966,4,8 -4349,sum_34,call_function,sum.dim_IntList,backward,20,1,1,1,3967,3,5 -4350,convert_element_type_1043,call_function,convert_element_type.default,backward,20,1,1,1,3975,2205,6 -4351,convert_element_type_1044,call_function,convert_element_type.default,backward,20,1,1,1,3968,2,3 -4352,add_196,call_function,add.Tensor,unknown,,2,2,1,3976,2204,10 -4353,dtype_cast_328,call_function,dtype_cast.default,backward,20,1,1,1,3969,1,3 -4354,alias_default_1434,call_function,alias.default,backward,20,1,1,0,3970,0,2 -4355,alias_default_925,call_function,alias.default,unknown,,1,1,3,3977,2203,4 -4356,einsum_default_311,call_function,einsum.default,backward,19,2,2,1,3978,3,5 -4357,permute_571,call_function,permute.default,backward,19,1,1,1,4,2199,3 -4358,einsum_default_312,call_function,einsum.default,backward,19,2,2,1,3979,2198,5 -4359,permute_572,call_function,permute.default,backward,19,1,1,1,3979,2,4 -4360,dtype_cast_329,call_function,dtype_cast.default,backward,19,1,1,1,3980,1,4 -4361,alias_default_1423,call_function,alias.default,backward,19,1,1,0,3981,0,3 -4362,alias_default_926,call_function,alias.default,backward,19,1,1,2,3980,2197,4 -4363,mul_364,call_function,mul.Tensor,backward,19,2,2,1,3981,2185,8 -4364,mul_365,call_function,mul.Tensor,backward,19,2,2,1,3981,2189,8 -4365,alias_default_927,call_function,alias.default,backward,19,1,1,2,3982,2184,4 -4366,einsum_default_313,call_function,einsum.default,backward,19,2,2,1,3983,3,5 -4367,permute_575,call_function,permute.default,backward,19,1,1,1,4,2180,3 -4368,einsum_default_314,call_function,einsum.default,backward,19,2,2,1,3984,2179,5 -4369,permute_576,call_function,permute.default,backward,19,1,1,1,3984,2,4 -4370,dtype_cast_330,call_function,dtype_cast.default,backward,19,1,1,1,3985,1,4 -4371,alias_default_1424,call_function,alias.default,backward,19,1,1,0,3986,0,3 
-4372,convert_element_type_1053,call_function,convert_element_type.default,backward,19,1,1,1,3982,2188,6 -4373,convert_element_type_1054,call_function,convert_element_type.default,backward,19,1,1,1,2186,2198,4 -4374,alias_default_928,call_function,alias.default,backward,19,1,1,2,2187,2197,4 -4375,neg_36,call_function,neg.default,backward,19,1,1,1,2188,2196,8 -4376,exp_36,call_function,exp.default,backward,19,1,1,1,2189,2195,6 -4377,add_197,call_function,add.Tensor,backward,19,1,1,1,2190,2194,4 -4378,reciprocal_8,call_function,reciprocal.default,backward,19,1,1,1,2191,2193,4 -4379,mul_366,call_function,mul.Tensor,backward,19,1,1,1,2192,2192,6 -4380,alias_default_929,call_function,alias.default,backward,19,1,1,2,2193,2191,4 -4381,mul_367,call_function,mul.Tensor,backward,19,2,2,1,3991,2187,8 -4382,sub_25,call_function,sub.Tensor,backward,19,1,1,1,2194,2189,4 -4383,mul_368,call_function,mul.Tensor,backward,19,2,2,1,2195,2188,8 -4384,add_198,call_function,add.Tensor,backward,19,1,1,1,2196,2187,4 -4385,mul_369,call_function,mul.Tensor,backward,19,2,2,1,3995,2186,8 -4386,convert_element_type_1055,call_function,convert_element_type.default,backward,19,1,1,1,3996,2185,6 -4387,alias_default_930,call_function,alias.default,backward,19,1,1,2,3997,2184,4 -4388,einsum_default_315,call_function,einsum.default,backward,19,2,2,1,3998,3,5 -4389,permute_579,call_function,permute.default,backward,19,1,1,1,4,2180,3 -4390,einsum_default_316,call_function,einsum.default,backward,19,2,2,1,3999,2179,5 -4391,add_199,call_function,add.Tensor,unknown,,2,2,1,4004,2178,10 -4392,permute_580,call_function,permute.default,backward,19,1,1,1,3999,2,4 -4393,dtype_cast_331,call_function,dtype_cast.default,backward,19,1,1,1,4000,1,4 -4394,alias_default_1422,call_function,alias.default,backward,19,1,1,0,4001,0,3 -4395,convert_element_type_1060,call_function,convert_element_type.default,backward,19,1,1,1,4005,2177,8 
-4396,convert_element_type_1061,call_function,convert_element_type.default,backward,19,1,1,1,2166,2177,4 -4397,convert_element_type_1062,call_function,convert_element_type.default,backward,19,1,1,1,3,2171,2 -4398,alias_default_931,call_function,alias.default,backward,19,1,1,2,4006,2176,4 -4399,mul_370,call_function,mul.Tensor,backward,19,2,2,1,4008,2170,8 -4400,mul_371,call_function,mul.Tensor,backward,19,2,2,1,2174,2176,8 -4401,alias_default_932,call_function,alias.default,backward,19,1,1,2,4009,2169,4 -4402,alias_default_933,call_function,alias.default,backward,19,1,1,3,2175,2175,4 -4403,mul_372,call_function,mul.Tensor,backward,19,2,2,1,4013,2168,8 -4404,sum_35,call_function,sum.dim_IntList,backward,19,1,1,1,4014,2167,5 -4405,div_45,call_function,div.Tensor,backward,19,1,1,1,2176,2167,6 -4406,mul_373,call_function,mul.Tensor,backward,19,2,2,1,4016,2166,8 -4407,sub_26,call_function,sub.Tensor,backward,19,2,2,1,4017,2165,10 -4408,mul_374,call_function,mul.Tensor,backward,19,2,2,1,4018,2164,8 -4409,mul_375,call_function,mul.Tensor,backward,19,2,2,1,4010,4,8 -4410,sum_36,call_function,sum.dim_IntList,backward,19,1,1,1,4011,3,5 -4411,convert_element_type_1063,call_function,convert_element_type.default,backward,19,1,1,1,4019,2163,6 -4412,convert_element_type_1064,call_function,convert_element_type.default,backward,19,1,1,1,4012,2,3 -4413,add_200,call_function,add.Tensor,unknown,,2,2,1,4020,2162,10 -4414,dtype_cast_332,call_function,dtype_cast.default,backward,19,1,1,1,4013,1,3 -4415,alias_default_1426,call_function,alias.default,backward,19,1,1,0,4014,0,2 -4416,alias_default_934,call_function,alias.default,unknown,,1,1,3,4021,2161,4 -4417,einsum_default_317,call_function,einsum.default,backward,19,2,2,1,4022,3,5 -4418,permute_583,call_function,permute.default,backward,19,1,1,1,4,2157,3 -4419,einsum_default_318,call_function,einsum.default,backward,19,2,2,1,4023,2156,5 -4420,permute_584,call_function,permute.default,backward,19,1,1,1,4023,2,4 
-4421,dtype_cast_333,call_function,dtype_cast.default,backward,19,1,1,1,4024,1,4 -4422,alias_default_1421,call_function,alias.default,backward,19,1,1,0,4025,0,3 -4423,view_832,call_function,view.default,backward,19,1,1,1,4024,2155,4 -4424,permute_585,call_function,permute.default,backward,19,1,1,1,4025,2154,4 -4425,_scaled_dot_product_flash_attention_backward_8,call_function,_scaled_dot_product_flash_attention_backward.default,backward,19,8,8,3,4029,2153,2 -4426,getitem_276,call_function,getitem,backward,19,1,1,1,4030,2126,2 -4427,getitem_277,call_function,getitem,backward,19,1,1,1,4030,2127,2 -4428,getitem_278,call_function,getitem,backward,19,1,1,1,4030,2120,2 -4429,permute_586,call_function,permute.default,backward,19,1,1,1,4031,2119,2 -4430,permute_587,call_function,permute.default,backward,19,1,1,1,4031,2126,2 -4431,permute_588,call_function,permute.default,backward,19,1,1,1,4031,2125,2 -4432,convert_element_type_1069,call_function,convert_element_type.default,backward,19,1,1,1,4032,2125,2 -4433,convert_element_type_1070,call_function,convert_element_type.default,backward,19,1,1,1,4032,2124,2 -4434,view_833,call_function,view.default,backward,19,1,1,1,4033,2124,2 -4435,view_as_complex_72,call_function,view_as_complex.default,backward,19,1,1,1,4034,2123,6 -4436,_conj_16,call_function,_conj.default,backward,19,1,1,1,4,2124,3 -4437,clone_70,call_function,clone.default,backward,19,1,1,1,5,2123,3 -4438,mul_376,call_function,mul.Tensor,backward,19,2,2,1,4037,2122,8 -4439,view_834,call_function,view.default,backward,19,1,1,1,4033,2123,2 -4440,view_as_complex_73,call_function,view_as_complex.default,backward,19,1,1,1,4034,2122,6 -4441,_conj_17,call_function,_conj.default,backward,19,1,1,1,4,2123,3 -4442,clone_71,call_function,clone.default,backward,19,1,1,1,5,2122,3 -4443,mul_377,call_function,mul.Tensor,backward,19,2,2,1,4037,2121,8 -4444,view_as_real_72,call_function,view_as_real.default,backward,19,1,1,1,4038,2121,6 
-4445,view_835,call_function,view.default,backward,19,1,1,1,4039,2120,6 -4446,convert_element_type_1071,call_function,convert_element_type.default,backward,19,1,1,1,4040,2119,6 -4447,view_as_real_73,call_function,view_as_real.default,backward,19,1,1,1,4038,2120,6 -4448,view_836,call_function,view.default,backward,19,1,1,1,4039,2119,6 -4449,convert_element_type_1072,call_function,convert_element_type.default,backward,19,1,1,1,4040,2118,6 -4450,view_837,call_function,view.default,backward,19,1,1,1,4032,2118,2 -4451,view_838,call_function,view.default,backward,19,1,1,1,4041,2118,5 -4452,view_839,call_function,view.default,backward,19,1,1,1,4041,2117,5 -4453,alias_default_935,call_function,alias.default,backward,19,1,1,2,4033,2117,4 -4454,einsum_default_319,call_function,einsum.default,backward,19,2,2,1,4034,3,5 -4455,permute_591,call_function,permute.default,backward,19,1,1,1,4,2113,3 -4456,einsum_default_320,call_function,einsum.default,backward,19,2,2,1,4035,2112,5 -4457,permute_592,call_function,permute.default,backward,19,1,1,1,4035,2,4 -4458,dtype_cast_334,call_function,dtype_cast.default,backward,19,1,1,1,4036,1,4 -4459,alias_default_1420,call_function,alias.default,backward,19,1,1,0,4037,0,3 -4460,alias_default_936,call_function,alias.default,backward,19,1,1,2,4042,2117,4 -4461,einsum_default_321,call_function,einsum.default,backward,19,2,2,1,4043,3,5 -4462,permute_595,call_function,permute.default,backward,19,1,1,1,4,2113,3 -4463,einsum_default_322,call_function,einsum.default,backward,19,2,2,1,4044,2112,5 -4464,add_201,call_function,add.Tensor,unknown,,2,2,1,4051,2111,10 -4465,permute_596,call_function,permute.default,backward,19,1,1,1,4044,2,4 -4466,dtype_cast_335,call_function,dtype_cast.default,backward,19,1,1,1,4045,1,4 -4467,alias_default_1419,call_function,alias.default,backward,19,1,1,0,4046,0,3 -4468,alias_default_937,call_function,alias.default,backward,19,1,1,2,4042,2116,4 
-4469,einsum_default_323,call_function,einsum.default,backward,19,2,2,1,4043,3,5 -4470,permute_599,call_function,permute.default,backward,19,1,1,1,4,2112,3 -4471,einsum_default_324,call_function,einsum.default,backward,19,2,2,1,4044,2111,5 -4472,add_202,call_function,add.Tensor,unknown,,2,2,1,4067,2110,10 -4473,permute_600,call_function,permute.default,backward,19,1,1,1,4044,2,4 -4474,dtype_cast_336,call_function,dtype_cast.default,backward,19,1,1,1,4045,1,4 -4475,alias_default_1418,call_function,alias.default,backward,19,1,1,0,4046,0,3 -4476,convert_element_type_1085,call_function,convert_element_type.default,backward,19,1,1,1,4068,2109,8 -4477,convert_element_type_1086,call_function,convert_element_type.default,backward,19,1,1,1,2099,2109,4 -4478,convert_element_type_1087,call_function,convert_element_type.default,backward,19,1,1,1,3,2103,2 -4479,alias_default_938,call_function,alias.default,backward,19,1,1,2,4069,2108,4 -4480,mul_378,call_function,mul.Tensor,backward,19,2,2,1,4071,2102,8 -4481,mul_379,call_function,mul.Tensor,backward,19,2,2,1,2107,2108,8 -4482,alias_default_939,call_function,alias.default,backward,19,1,1,2,4072,2101,4 -4483,alias_default_940,call_function,alias.default,backward,19,1,1,3,2108,2107,4 -4484,mul_380,call_function,mul.Tensor,backward,19,2,2,1,4076,2100,8 -4485,sum_37,call_function,sum.dim_IntList,backward,19,1,1,1,4077,2099,5 -4486,div_46,call_function,div.Tensor,backward,19,1,1,1,2109,2099,6 -4487,mul_381,call_function,mul.Tensor,backward,19,2,2,1,4079,2098,8 -4488,sub_27,call_function,sub.Tensor,backward,19,2,2,1,4080,2097,10 -4489,mul_382,call_function,mul.Tensor,backward,19,2,2,1,4081,2096,8 -4490,mul_383,call_function,mul.Tensor,backward,19,2,2,1,4073,4,8 -4491,sum_38,call_function,sum.dim_IntList,backward,19,1,1,1,4074,3,5 -4492,convert_element_type_1088,call_function,convert_element_type.default,backward,19,1,1,1,4082,2095,6 -4493,convert_element_type_1089,call_function,convert_element_type.default,backward,19,1,1,1,4075,2,3 
-4494,add_203,call_function,add.Tensor,unknown,,2,2,1,4083,2094,10 -4495,dtype_cast_337,call_function,dtype_cast.default,backward,19,1,1,1,4076,1,3 -4496,alias_default_1425,call_function,alias.default,backward,19,1,1,0,4077,0,2 -4497,alias_default_941,call_function,alias.default,unknown,,1,1,3,4084,2093,4 -4498,einsum_default_325,call_function,einsum.default,backward,18,2,2,1,4085,3,5 -4499,permute_603,call_function,permute.default,backward,18,1,1,1,4,2089,3 -4500,einsum_default_326,call_function,einsum.default,backward,18,2,2,1,4086,2088,5 -4501,permute_604,call_function,permute.default,backward,18,1,1,1,4086,2,4 -4502,dtype_cast_338,call_function,dtype_cast.default,backward,18,1,1,1,4087,1,4 -4503,alias_default_1414,call_function,alias.default,backward,18,1,1,0,4088,0,3 -4504,alias_default_942,call_function,alias.default,backward,18,1,1,2,4087,2087,4 -4505,mul_384,call_function,mul.Tensor,backward,18,2,2,1,4088,2075,8 -4506,mul_385,call_function,mul.Tensor,backward,18,2,2,1,4088,2079,8 -4507,alias_default_943,call_function,alias.default,backward,18,1,1,2,4089,2074,4 -4508,einsum_default_327,call_function,einsum.default,backward,18,2,2,1,4090,3,5 -4509,permute_607,call_function,permute.default,backward,18,1,1,1,4,2070,3 -4510,einsum_default_328,call_function,einsum.default,backward,18,2,2,1,4091,2069,5 -4511,permute_608,call_function,permute.default,backward,18,1,1,1,4091,2,4 -4512,dtype_cast_339,call_function,dtype_cast.default,backward,18,1,1,1,4092,1,4 -4513,alias_default_1415,call_function,alias.default,backward,18,1,1,0,4093,0,3 -4514,convert_element_type_1098,call_function,convert_element_type.default,backward,18,1,1,1,4089,2078,6 -4515,convert_element_type_1099,call_function,convert_element_type.default,backward,18,1,1,1,2076,2088,4 -4516,alias_default_944,call_function,alias.default,backward,18,1,1,2,2077,2087,4 -4517,neg_37,call_function,neg.default,backward,18,1,1,1,2078,2086,8 -4518,exp_37,call_function,exp.default,backward,18,1,1,1,2079,2085,6 
-4519,add_204,call_function,add.Tensor,backward,18,1,1,1,2080,2084,4 -4520,reciprocal_9,call_function,reciprocal.default,backward,18,1,1,1,2081,2083,4 -4521,mul_386,call_function,mul.Tensor,backward,18,1,1,1,2082,2082,6 -4522,alias_default_945,call_function,alias.default,backward,18,1,1,2,2083,2081,4 -4523,mul_387,call_function,mul.Tensor,backward,18,2,2,1,4098,2077,8 -4524,sub_28,call_function,sub.Tensor,backward,18,1,1,1,2084,2079,4 -4525,mul_388,call_function,mul.Tensor,backward,18,2,2,1,2085,2078,8 -4526,add_205,call_function,add.Tensor,backward,18,1,1,1,2086,2077,4 -4527,mul_389,call_function,mul.Tensor,backward,18,2,2,1,4102,2076,8 -4528,convert_element_type_1100,call_function,convert_element_type.default,backward,18,1,1,1,4103,2075,6 -4529,alias_default_946,call_function,alias.default,backward,18,1,1,2,4104,2074,4 -4530,einsum_default_329,call_function,einsum.default,backward,18,2,2,1,4105,3,5 -4531,permute_611,call_function,permute.default,backward,18,1,1,1,4,2070,3 -4532,einsum_default_330,call_function,einsum.default,backward,18,2,2,1,4106,2069,5 -4533,add_206,call_function,add.Tensor,unknown,,2,2,1,4111,2068,10 -4534,permute_612,call_function,permute.default,backward,18,1,1,1,4106,2,4 -4535,dtype_cast_340,call_function,dtype_cast.default,backward,18,1,1,1,4107,1,4 -4536,alias_default_1413,call_function,alias.default,backward,18,1,1,0,4108,0,3 -4537,convert_element_type_1105,call_function,convert_element_type.default,backward,18,1,1,1,4112,2067,8 -4538,convert_element_type_1106,call_function,convert_element_type.default,backward,18,1,1,1,2056,2067,4 -4539,convert_element_type_1107,call_function,convert_element_type.default,backward,18,1,1,1,3,2061,2 -4540,alias_default_947,call_function,alias.default,backward,18,1,1,2,4113,2066,4 -4541,mul_390,call_function,mul.Tensor,backward,18,2,2,1,4115,2060,8 -4542,mul_391,call_function,mul.Tensor,backward,18,2,2,1,2064,2066,8 -4543,alias_default_948,call_function,alias.default,backward,18,1,1,2,4116,2059,4 
-4544,alias_default_949,call_function,alias.default,backward,18,1,1,3,2065,2065,4 -4545,mul_392,call_function,mul.Tensor,backward,18,2,2,1,4120,2058,8 -4546,sum_39,call_function,sum.dim_IntList,backward,18,1,1,1,4121,2057,5 -4547,div_47,call_function,div.Tensor,backward,18,1,1,1,2066,2057,6 -4548,mul_393,call_function,mul.Tensor,backward,18,2,2,1,4123,2056,8 -4549,sub_29,call_function,sub.Tensor,backward,18,2,2,1,4124,2055,10 -4550,mul_394,call_function,mul.Tensor,backward,18,2,2,1,4125,2054,8 -4551,mul_395,call_function,mul.Tensor,backward,18,2,2,1,4117,4,8 -4552,sum_40,call_function,sum.dim_IntList,backward,18,1,1,1,4118,3,5 -4553,convert_element_type_1108,call_function,convert_element_type.default,backward,18,1,1,1,4126,2053,6 -4554,convert_element_type_1109,call_function,convert_element_type.default,backward,18,1,1,1,4119,2,3 -4555,add_207,call_function,add.Tensor,unknown,,2,2,1,4127,2052,10 -4556,dtype_cast_341,call_function,dtype_cast.default,backward,18,1,1,1,4120,1,3 -4557,alias_default_1417,call_function,alias.default,backward,18,1,1,0,4121,0,2 -4558,alias_default_950,call_function,alias.default,unknown,,1,1,3,4128,2051,4 -4559,einsum_default_331,call_function,einsum.default,backward,18,2,2,1,4129,3,5 -4560,permute_615,call_function,permute.default,backward,18,1,1,1,4,2047,3 -4561,einsum_default_332,call_function,einsum.default,backward,18,2,2,1,4130,2046,5 -4562,permute_616,call_function,permute.default,backward,18,1,1,1,4130,2,4 -4563,dtype_cast_342,call_function,dtype_cast.default,backward,18,1,1,1,4131,1,4 -4564,alias_default_1412,call_function,alias.default,backward,18,1,1,0,4132,0,3 -4565,view_854,call_function,view.default,backward,18,1,1,1,4131,2045,4 -4566,permute_617,call_function,permute.default,backward,18,1,1,1,4132,2044,4 -4567,_scaled_dot_product_flash_attention_backward_9,call_function,_scaled_dot_product_flash_attention_backward.default,backward,18,8,8,3,4136,2043,2 -4568,getitem_279,call_function,getitem,backward,18,1,1,1,4137,2016,2 
-4569,getitem_280,call_function,getitem,backward,18,1,1,1,4137,2017,2 -4570,getitem_281,call_function,getitem,backward,18,1,1,1,4137,2010,2 -4571,permute_618,call_function,permute.default,backward,18,1,1,1,4138,2009,2 -4572,permute_619,call_function,permute.default,backward,18,1,1,1,4138,2016,2 -4573,permute_620,call_function,permute.default,backward,18,1,1,1,4138,2015,2 -4574,convert_element_type_1114,call_function,convert_element_type.default,backward,18,1,1,1,4139,2015,2 -4575,convert_element_type_1115,call_function,convert_element_type.default,backward,18,1,1,1,4139,2014,2 -4576,view_855,call_function,view.default,backward,18,1,1,1,4140,2014,2 -4577,view_as_complex_74,call_function,view_as_complex.default,backward,18,1,1,1,4141,2013,6 -4578,_conj_18,call_function,_conj.default,backward,18,1,1,1,4,2014,3 -4579,clone_78,call_function,clone.default,backward,18,1,1,1,5,2013,3 -4580,mul_396,call_function,mul.Tensor,backward,18,2,2,1,4144,2012,8 -4581,view_856,call_function,view.default,backward,18,1,1,1,4140,2013,2 -4582,view_as_complex_75,call_function,view_as_complex.default,backward,18,1,1,1,4141,2012,6 -4583,_conj_19,call_function,_conj.default,backward,18,1,1,1,4,2013,3 -4584,clone_79,call_function,clone.default,backward,18,1,1,1,5,2012,3 -4585,mul_397,call_function,mul.Tensor,backward,18,2,2,1,4144,2011,8 -4586,view_as_real_74,call_function,view_as_real.default,backward,18,1,1,1,4145,2011,6 -4587,view_857,call_function,view.default,backward,18,1,1,1,4146,2010,6 -4588,convert_element_type_1116,call_function,convert_element_type.default,backward,18,1,1,1,4147,2009,6 -4589,view_as_real_75,call_function,view_as_real.default,backward,18,1,1,1,4145,2010,6 -4590,view_858,call_function,view.default,backward,18,1,1,1,4146,2009,6 -4591,convert_element_type_1117,call_function,convert_element_type.default,backward,18,1,1,1,4147,2008,6 -4592,view_859,call_function,view.default,backward,18,1,1,1,4139,2008,2 
-4593,view_860,call_function,view.default,backward,18,1,1,1,4148,2008,5 -4594,view_861,call_function,view.default,backward,18,1,1,1,4148,2007,5 -4595,alias_default_951,call_function,alias.default,backward,18,1,1,2,4140,2007,4 -4596,einsum_default_333,call_function,einsum.default,backward,18,2,2,1,4141,3,5 -4597,permute_623,call_function,permute.default,backward,18,1,1,1,4,2003,3 -4598,einsum_default_334,call_function,einsum.default,backward,18,2,2,1,4142,2002,5 -4599,permute_624,call_function,permute.default,backward,18,1,1,1,4142,2,4 -4600,dtype_cast_343,call_function,dtype_cast.default,backward,18,1,1,1,4143,1,4 -4601,alias_default_1411,call_function,alias.default,backward,18,1,1,0,4144,0,3 -4602,alias_default_952,call_function,alias.default,backward,18,1,1,2,4149,2007,4 -4603,einsum_default_335,call_function,einsum.default,backward,18,2,2,1,4150,3,5 -4604,permute_627,call_function,permute.default,backward,18,1,1,1,4,2003,3 -4605,einsum_default_336,call_function,einsum.default,backward,18,2,2,1,4151,2002,5 -4606,add_208,call_function,add.Tensor,unknown,,2,2,1,4158,2001,10 -4607,permute_628,call_function,permute.default,backward,18,1,1,1,4151,2,4 -4608,dtype_cast_344,call_function,dtype_cast.default,backward,18,1,1,1,4152,1,4 -4609,alias_default_1410,call_function,alias.default,backward,18,1,1,0,4153,0,3 -4610,alias_default_953,call_function,alias.default,backward,18,1,1,2,4149,2006,4 -4611,einsum_default_337,call_function,einsum.default,backward,18,2,2,1,4150,3,5 -4612,permute_631,call_function,permute.default,backward,18,1,1,1,4,2002,3 -4613,einsum_default_338,call_function,einsum.default,backward,18,2,2,1,4151,2001,5 -4614,add_209,call_function,add.Tensor,unknown,,2,2,1,4174,2000,10 -4615,permute_632,call_function,permute.default,backward,18,1,1,1,4151,2,4 -4616,dtype_cast_345,call_function,dtype_cast.default,backward,18,1,1,1,4152,1,4 -4617,alias_default_1409,call_function,alias.default,backward,18,1,1,0,4153,0,3 
-4618,convert_element_type_1130,call_function,convert_element_type.default,backward,18,1,1,1,4175,1999,8 -4619,convert_element_type_1131,call_function,convert_element_type.default,backward,18,1,1,1,1989,1999,4 -4620,convert_element_type_1132,call_function,convert_element_type.default,backward,18,1,1,1,3,1993,2 -4621,alias_default_954,call_function,alias.default,backward,18,1,1,2,4176,1998,4 -4622,mul_398,call_function,mul.Tensor,backward,18,2,2,1,4178,1992,8 -4623,mul_399,call_function,mul.Tensor,backward,18,2,2,1,1997,1998,8 -4624,alias_default_955,call_function,alias.default,backward,18,1,1,2,4179,1991,4 -4625,alias_default_956,call_function,alias.default,backward,18,1,1,3,1998,1997,4 -4626,mul_400,call_function,mul.Tensor,backward,18,2,2,1,4183,1990,8 -4627,sum_41,call_function,sum.dim_IntList,backward,18,1,1,1,4184,1989,5 -4628,div_48,call_function,div.Tensor,backward,18,1,1,1,1999,1989,6 -4629,mul_401,call_function,mul.Tensor,backward,18,2,2,1,4186,1988,8 -4630,sub_30,call_function,sub.Tensor,backward,18,2,2,1,4187,1987,10 -4631,mul_402,call_function,mul.Tensor,backward,18,2,2,1,4188,1986,8 -4632,mul_403,call_function,mul.Tensor,backward,18,2,2,1,4180,4,8 -4633,sum_42,call_function,sum.dim_IntList,backward,18,1,1,1,4181,3,5 -4634,convert_element_type_1133,call_function,convert_element_type.default,backward,18,1,1,1,4189,1985,6 -4635,convert_element_type_1134,call_function,convert_element_type.default,backward,18,1,1,1,4182,2,3 -4636,add_210,call_function,add.Tensor,unknown,,2,2,1,4190,1984,10 -4637,dtype_cast_346,call_function,dtype_cast.default,backward,18,1,1,1,4183,1,3 -4638,alias_default_1416,call_function,alias.default,backward,18,1,1,0,4184,0,2 -4639,alias_default_957,call_function,alias.default,unknown,,1,1,3,4191,1983,4 -4640,einsum_default_339,call_function,einsum.default,backward,17,2,2,1,4192,3,5 -4641,permute_635,call_function,permute.default,backward,17,1,1,1,4,1979,3 
-4642,einsum_default_340,call_function,einsum.default,backward,17,2,2,1,4193,1978,5 -4643,permute_636,call_function,permute.default,backward,17,1,1,1,4193,2,4 -4644,dtype_cast_347,call_function,dtype_cast.default,backward,17,1,1,1,4194,1,4 -4645,alias_default_1405,call_function,alias.default,backward,17,1,1,0,4195,0,3 -4646,alias_default_958,call_function,alias.default,backward,17,1,1,2,4194,1977,4 -4647,mul_404,call_function,mul.Tensor,backward,17,2,2,1,4195,1965,8 -4648,mul_405,call_function,mul.Tensor,backward,17,2,2,1,4195,1969,8 -4649,alias_default_959,call_function,alias.default,backward,17,1,1,2,4196,1964,4 -4650,einsum_default_341,call_function,einsum.default,backward,17,2,2,1,4197,3,5 -4651,permute_639,call_function,permute.default,backward,17,1,1,1,4,1960,3 -4652,einsum_default_342,call_function,einsum.default,backward,17,2,2,1,4198,1959,5 -4653,permute_640,call_function,permute.default,backward,17,1,1,1,4198,2,4 -4654,dtype_cast_348,call_function,dtype_cast.default,backward,17,1,1,1,4199,1,4 -4655,alias_default_1406,call_function,alias.default,backward,17,1,1,0,4200,0,3 -4656,convert_element_type_1143,call_function,convert_element_type.default,backward,17,1,1,1,4196,1968,6 -4657,convert_element_type_1144,call_function,convert_element_type.default,backward,17,1,1,1,1966,1978,4 -4658,alias_default_960,call_function,alias.default,backward,17,1,1,2,1967,1977,4 -4659,neg_38,call_function,neg.default,backward,17,1,1,1,1968,1976,8 -4660,exp_38,call_function,exp.default,backward,17,1,1,1,1969,1975,6 -4661,add_211,call_function,add.Tensor,backward,17,1,1,1,1970,1974,4 -4662,reciprocal_10,call_function,reciprocal.default,backward,17,1,1,1,1971,1973,4 -4663,mul_406,call_function,mul.Tensor,backward,17,1,1,1,1972,1972,6 -4664,alias_default_961,call_function,alias.default,backward,17,1,1,2,1973,1971,4 -4665,mul_407,call_function,mul.Tensor,backward,17,2,2,1,4205,1967,8 -4666,sub_31,call_function,sub.Tensor,backward,17,1,1,1,1974,1969,4 
-4667,mul_408,call_function,mul.Tensor,backward,17,2,2,1,1975,1968,8 -4668,add_212,call_function,add.Tensor,backward,17,1,1,1,1976,1967,4 -4669,mul_409,call_function,mul.Tensor,backward,17,2,2,1,4209,1966,8 -4670,convert_element_type_1145,call_function,convert_element_type.default,backward,17,1,1,1,4210,1965,6 -4671,alias_default_962,call_function,alias.default,backward,17,1,1,2,4211,1964,4 -4672,einsum_default_343,call_function,einsum.default,backward,17,2,2,1,4212,3,5 -4673,permute_643,call_function,permute.default,backward,17,1,1,1,4,1960,3 -4674,einsum_default_344,call_function,einsum.default,backward,17,2,2,1,4213,1959,5 -4675,add_213,call_function,add.Tensor,unknown,,2,2,1,4218,1958,10 -4676,permute_644,call_function,permute.default,backward,17,1,1,1,4213,2,4 -4677,dtype_cast_349,call_function,dtype_cast.default,backward,17,1,1,1,4214,1,4 -4678,alias_default_1404,call_function,alias.default,backward,17,1,1,0,4215,0,3 -4679,convert_element_type_1150,call_function,convert_element_type.default,backward,17,1,1,1,4219,1957,8 -4680,convert_element_type_1151,call_function,convert_element_type.default,backward,17,1,1,1,1946,1957,4 -4681,convert_element_type_1152,call_function,convert_element_type.default,backward,17,1,1,1,3,1951,2 -4682,alias_default_963,call_function,alias.default,backward,17,1,1,2,4220,1956,4 -4683,mul_410,call_function,mul.Tensor,backward,17,2,2,1,4222,1950,8 -4684,mul_411,call_function,mul.Tensor,backward,17,2,2,1,1954,1956,8 -4685,alias_default_964,call_function,alias.default,backward,17,1,1,2,4223,1949,4 -4686,alias_default_965,call_function,alias.default,backward,17,1,1,3,1955,1955,4 -4687,mul_412,call_function,mul.Tensor,backward,17,2,2,1,4227,1948,8 -4688,sum_43,call_function,sum.dim_IntList,backward,17,1,1,1,4228,1947,5 -4689,div_49,call_function,div.Tensor,backward,17,1,1,1,1956,1947,6 -4690,mul_413,call_function,mul.Tensor,backward,17,2,2,1,4230,1946,8 -4691,sub_32,call_function,sub.Tensor,backward,17,2,2,1,4231,1945,10 
-4692,mul_414,call_function,mul.Tensor,backward,17,2,2,1,4232,1944,8 -4693,mul_415,call_function,mul.Tensor,backward,17,2,2,1,4224,4,8 -4694,sum_44,call_function,sum.dim_IntList,backward,17,1,1,1,4225,3,5 -4695,convert_element_type_1153,call_function,convert_element_type.default,backward,17,1,1,1,4233,1943,6 -4696,convert_element_type_1154,call_function,convert_element_type.default,backward,17,1,1,1,4226,2,3 -4697,add_214,call_function,add.Tensor,unknown,,2,2,1,4234,1942,10 -4698,dtype_cast_350,call_function,dtype_cast.default,backward,17,1,1,1,4227,1,3 -4699,alias_default_1408,call_function,alias.default,backward,17,1,1,0,4228,0,2 -4700,alias_default_966,call_function,alias.default,unknown,,1,1,3,4235,1941,4 -4701,einsum_default_345,call_function,einsum.default,backward,17,2,2,1,4236,3,5 -4702,permute_647,call_function,permute.default,backward,17,1,1,1,4,1937,3 -4703,einsum_default_346,call_function,einsum.default,backward,17,2,2,1,4237,1936,5 -4704,permute_648,call_function,permute.default,backward,17,1,1,1,4237,2,4 -4705,dtype_cast_351,call_function,dtype_cast.default,backward,17,1,1,1,4238,1,4 -4706,alias_default_1403,call_function,alias.default,backward,17,1,1,0,4239,0,3 -4707,view_876,call_function,view.default,backward,17,1,1,1,4238,1935,4 -4708,permute_649,call_function,permute.default,backward,17,1,1,1,4239,1934,4 -4709,_scaled_dot_product_flash_attention_backward_10,call_function,_scaled_dot_product_flash_attention_backward.default,backward,17,8,8,3,4243,1933,2 -4710,getitem_282,call_function,getitem,backward,17,1,1,1,4244,1906,2 -4711,getitem_283,call_function,getitem,backward,17,1,1,1,4244,1907,2 -4712,getitem_284,call_function,getitem,backward,17,1,1,1,4244,1900,2 -4713,permute_650,call_function,permute.default,backward,17,1,1,1,4245,1899,2 -4714,permute_651,call_function,permute.default,backward,17,1,1,1,4245,1906,2 -4715,permute_652,call_function,permute.default,backward,17,1,1,1,4245,1905,2 
-4716,convert_element_type_1159,call_function,convert_element_type.default,backward,17,1,1,1,4246,1905,2 -4717,convert_element_type_1160,call_function,convert_element_type.default,backward,17,1,1,1,4246,1904,2 -4718,view_877,call_function,view.default,backward,17,1,1,1,4247,1904,2 -4719,view_as_complex_76,call_function,view_as_complex.default,backward,17,1,1,1,4248,1903,6 -4720,_conj_20,call_function,_conj.default,backward,17,1,1,1,4,1904,3 -4721,clone_86,call_function,clone.default,backward,17,1,1,1,5,1903,3 -4722,mul_416,call_function,mul.Tensor,backward,17,2,2,1,4251,1902,8 -4723,view_878,call_function,view.default,backward,17,1,1,1,4247,1903,2 -4724,view_as_complex_77,call_function,view_as_complex.default,backward,17,1,1,1,4248,1902,6 -4725,_conj_21,call_function,_conj.default,backward,17,1,1,1,4,1903,3 -4726,clone_87,call_function,clone.default,backward,17,1,1,1,5,1902,3 -4727,mul_417,call_function,mul.Tensor,backward,17,2,2,1,4251,1901,8 -4728,view_as_real_76,call_function,view_as_real.default,backward,17,1,1,1,4252,1901,6 -4729,view_879,call_function,view.default,backward,17,1,1,1,4253,1900,6 -4730,convert_element_type_1161,call_function,convert_element_type.default,backward,17,1,1,1,4254,1899,6 -4731,view_as_real_77,call_function,view_as_real.default,backward,17,1,1,1,4252,1900,6 -4732,view_880,call_function,view.default,backward,17,1,1,1,4253,1899,6 -4733,convert_element_type_1162,call_function,convert_element_type.default,backward,17,1,1,1,4254,1898,6 -4734,view_881,call_function,view.default,backward,17,1,1,1,4246,1898,2 -4735,view_882,call_function,view.default,backward,17,1,1,1,4255,1898,5 -4736,view_883,call_function,view.default,backward,17,1,1,1,4255,1897,5 -4737,alias_default_967,call_function,alias.default,backward,17,1,1,2,4247,1897,4 -4738,einsum_default_347,call_function,einsum.default,backward,17,2,2,1,4248,3,5 -4739,permute_655,call_function,permute.default,backward,17,1,1,1,4,1893,3 
-4740,einsum_default_348,call_function,einsum.default,backward,17,2,2,1,4249,1892,5 -4741,permute_656,call_function,permute.default,backward,17,1,1,1,4249,2,4 -4742,dtype_cast_352,call_function,dtype_cast.default,backward,17,1,1,1,4250,1,4 -4743,alias_default_1402,call_function,alias.default,backward,17,1,1,0,4251,0,3 -4744,alias_default_968,call_function,alias.default,backward,17,1,1,2,4256,1897,4 -4745,einsum_default_349,call_function,einsum.default,backward,17,2,2,1,4257,3,5 -4746,permute_659,call_function,permute.default,backward,17,1,1,1,4,1893,3 -4747,einsum_default_350,call_function,einsum.default,backward,17,2,2,1,4258,1892,5 -4748,add_215,call_function,add.Tensor,unknown,,2,2,1,4265,1891,10 -4749,permute_660,call_function,permute.default,backward,17,1,1,1,4258,2,4 -4750,dtype_cast_353,call_function,dtype_cast.default,backward,17,1,1,1,4259,1,4 -4751,alias_default_1401,call_function,alias.default,backward,17,1,1,0,4260,0,3 -4752,alias_default_969,call_function,alias.default,backward,17,1,1,2,4256,1896,4 -4753,einsum_default_351,call_function,einsum.default,backward,17,2,2,1,4257,3,5 -4754,permute_663,call_function,permute.default,backward,17,1,1,1,4,1892,3 -4755,einsum_default_352,call_function,einsum.default,backward,17,2,2,1,4258,1891,5 -4756,add_216,call_function,add.Tensor,unknown,,2,2,1,4281,1890,10 -4757,permute_664,call_function,permute.default,backward,17,1,1,1,4258,2,4 -4758,dtype_cast_354,call_function,dtype_cast.default,backward,17,1,1,1,4259,1,4 -4759,alias_default_1400,call_function,alias.default,backward,17,1,1,0,4260,0,3 -4760,convert_element_type_1175,call_function,convert_element_type.default,backward,17,1,1,1,4282,1889,8 -4761,convert_element_type_1176,call_function,convert_element_type.default,backward,17,1,1,1,1879,1889,4 -4762,convert_element_type_1177,call_function,convert_element_type.default,backward,17,1,1,1,3,1883,2 -4763,alias_default_970,call_function,alias.default,backward,17,1,1,2,4283,1888,4 
-4764,mul_418,call_function,mul.Tensor,backward,17,2,2,1,4285,1882,8 -4765,mul_419,call_function,mul.Tensor,backward,17,2,2,1,1887,1888,8 -4766,alias_default_971,call_function,alias.default,backward,17,1,1,2,4286,1881,4 -4767,alias_default_972,call_function,alias.default,backward,17,1,1,3,1888,1887,4 -4768,mul_420,call_function,mul.Tensor,backward,17,2,2,1,4290,1880,8 -4769,sum_45,call_function,sum.dim_IntList,backward,17,1,1,1,4291,1879,5 -4770,div_50,call_function,div.Tensor,backward,17,1,1,1,1889,1879,6 -4771,mul_421,call_function,mul.Tensor,backward,17,2,2,1,4293,1878,8 -4772,sub_33,call_function,sub.Tensor,backward,17,2,2,1,4294,1877,10 -4773,mul_422,call_function,mul.Tensor,backward,17,2,2,1,4295,1876,8 -4774,mul_423,call_function,mul.Tensor,backward,17,2,2,1,4287,4,8 -4775,sum_46,call_function,sum.dim_IntList,backward,17,1,1,1,4288,3,5 -4776,convert_element_type_1178,call_function,convert_element_type.default,backward,17,1,1,1,4296,1875,6 -4777,convert_element_type_1179,call_function,convert_element_type.default,backward,17,1,1,1,4289,2,3 -4778,add_217,call_function,add.Tensor,unknown,,2,2,1,4297,1874,10 -4779,dtype_cast_355,call_function,dtype_cast.default,backward,17,1,1,1,4290,1,3 -4780,alias_default_1407,call_function,alias.default,backward,17,1,1,0,4291,0,2 -4781,alias_default_973,call_function,alias.default,unknown,,1,1,3,4298,1873,4 -4782,einsum_default_353,call_function,einsum.default,backward,16,2,2,1,4299,3,5 -4783,permute_667,call_function,permute.default,backward,16,1,1,1,4,1869,3 -4784,einsum_default_354,call_function,einsum.default,backward,16,2,2,1,4300,1868,5 -4785,permute_668,call_function,permute.default,backward,16,1,1,1,4300,2,4 -4786,dtype_cast_356,call_function,dtype_cast.default,backward,16,1,1,1,4301,1,4 -4787,alias_default_1396,call_function,alias.default,backward,16,1,1,0,4302,0,3 -4788,alias_default_974,call_function,alias.default,backward,16,1,1,2,4301,1867,4 -4789,mul_424,call_function,mul.Tensor,backward,16,2,2,1,4302,1855,8 
-4790,mul_425,call_function,mul.Tensor,backward,16,2,2,1,4302,1859,8 -4791,alias_default_975,call_function,alias.default,backward,16,1,1,2,4303,1854,4 -4792,einsum_default_355,call_function,einsum.default,backward,16,2,2,1,4304,3,5 -4793,permute_671,call_function,permute.default,backward,16,1,1,1,4,1850,3 -4794,einsum_default_356,call_function,einsum.default,backward,16,2,2,1,4305,1849,5 -4795,permute_672,call_function,permute.default,backward,16,1,1,1,4305,2,4 -4796,dtype_cast_357,call_function,dtype_cast.default,backward,16,1,1,1,4306,1,4 -4797,alias_default_1397,call_function,alias.default,backward,16,1,1,0,4307,0,3 -4798,convert_element_type_1188,call_function,convert_element_type.default,backward,16,1,1,1,4303,1858,6 -4799,convert_element_type_1189,call_function,convert_element_type.default,backward,16,1,1,1,1856,1868,4 -4800,alias_default_976,call_function,alias.default,backward,16,1,1,2,1857,1867,4 -4801,neg_39,call_function,neg.default,backward,16,1,1,1,1858,1866,8 -4802,exp_39,call_function,exp.default,backward,16,1,1,1,1859,1865,6 -4803,add_218,call_function,add.Tensor,backward,16,1,1,1,1860,1864,4 -4804,reciprocal_11,call_function,reciprocal.default,backward,16,1,1,1,1861,1863,4 -4805,mul_426,call_function,mul.Tensor,backward,16,1,1,1,1862,1862,6 -4806,alias_default_977,call_function,alias.default,backward,16,1,1,2,1863,1861,4 -4807,mul_427,call_function,mul.Tensor,backward,16,2,2,1,4312,1857,8 -4808,sub_34,call_function,sub.Tensor,backward,16,1,1,1,1864,1859,4 -4809,mul_428,call_function,mul.Tensor,backward,16,2,2,1,1865,1858,8 -4810,add_219,call_function,add.Tensor,backward,16,1,1,1,1866,1857,4 -4811,mul_429,call_function,mul.Tensor,backward,16,2,2,1,4316,1856,8 -4812,convert_element_type_1190,call_function,convert_element_type.default,backward,16,1,1,1,4317,1855,6 -4813,alias_default_978,call_function,alias.default,backward,16,1,1,2,4318,1854,4 -4814,einsum_default_357,call_function,einsum.default,backward,16,2,2,1,4319,3,5 
-4815,permute_675,call_function,permute.default,backward,16,1,1,1,4,1850,3 -4816,einsum_default_358,call_function,einsum.default,backward,16,2,2,1,4320,1849,5 -4817,add_220,call_function,add.Tensor,unknown,,2,2,1,4325,1848,10 -4818,permute_676,call_function,permute.default,backward,16,1,1,1,4320,2,4 -4819,dtype_cast_358,call_function,dtype_cast.default,backward,16,1,1,1,4321,1,4 -4820,alias_default_1395,call_function,alias.default,backward,16,1,1,0,4322,0,3 -4821,convert_element_type_1195,call_function,convert_element_type.default,backward,16,1,1,1,4326,1847,8 -4822,convert_element_type_1196,call_function,convert_element_type.default,backward,16,1,1,1,1836,1847,4 -4823,convert_element_type_1197,call_function,convert_element_type.default,backward,16,1,1,1,3,1841,2 -4824,alias_default_979,call_function,alias.default,backward,16,1,1,2,4327,1846,4 -4825,mul_430,call_function,mul.Tensor,backward,16,2,2,1,4329,1840,8 -4826,mul_431,call_function,mul.Tensor,backward,16,2,2,1,1844,1846,8 -4827,alias_default_980,call_function,alias.default,backward,16,1,1,2,4330,1839,4 -4828,alias_default_981,call_function,alias.default,backward,16,1,1,3,1845,1845,4 -4829,mul_432,call_function,mul.Tensor,backward,16,2,2,1,4334,1838,8 -4830,sum_47,call_function,sum.dim_IntList,backward,16,1,1,1,4335,1837,5 -4831,div_51,call_function,div.Tensor,backward,16,1,1,1,1846,1837,6 -4832,mul_433,call_function,mul.Tensor,backward,16,2,2,1,4337,1836,8 -4833,sub_35,call_function,sub.Tensor,backward,16,2,2,1,4338,1835,10 -4834,mul_434,call_function,mul.Tensor,backward,16,2,2,1,4339,1834,8 -4835,mul_435,call_function,mul.Tensor,backward,16,2,2,1,4331,4,8 -4836,sum_48,call_function,sum.dim_IntList,backward,16,1,1,1,4332,3,5 -4837,convert_element_type_1198,call_function,convert_element_type.default,backward,16,1,1,1,4340,1833,6 -4838,convert_element_type_1199,call_function,convert_element_type.default,backward,16,1,1,1,4333,2,3 -4839,add_221,call_function,add.Tensor,unknown,,2,2,1,4341,1832,10 
-4840,dtype_cast_359,call_function,dtype_cast.default,backward,16,1,1,1,4334,1,3 -4841,alias_default_1399,call_function,alias.default,backward,16,1,1,0,4335,0,2 -4842,alias_default_982,call_function,alias.default,unknown,,1,1,3,4342,1831,4 -4843,einsum_default_359,call_function,einsum.default,backward,16,2,2,1,4343,3,5 -4844,permute_679,call_function,permute.default,backward,16,1,1,1,4,1827,3 -4845,einsum_default_360,call_function,einsum.default,backward,16,2,2,1,4344,1826,5 -4846,permute_680,call_function,permute.default,backward,16,1,1,1,4344,2,4 -4847,dtype_cast_360,call_function,dtype_cast.default,backward,16,1,1,1,4345,1,4 -4848,alias_default_1394,call_function,alias.default,backward,16,1,1,0,4346,0,3 -4849,view_898,call_function,view.default,backward,16,1,1,1,4345,1825,4 -4850,permute_681,call_function,permute.default,backward,16,1,1,1,4346,1824,4 -4851,_scaled_dot_product_flash_attention_backward_11,call_function,_scaled_dot_product_flash_attention_backward.default,backward,16,8,8,3,4350,1823,2 -4852,getitem_285,call_function,getitem,backward,16,1,1,1,4351,1796,2 -4853,getitem_286,call_function,getitem,backward,16,1,1,1,4351,1797,2 -4854,getitem_287,call_function,getitem,backward,16,1,1,1,4351,1790,2 -4855,permute_682,call_function,permute.default,backward,16,1,1,1,4352,1789,2 -4856,permute_683,call_function,permute.default,backward,16,1,1,1,4352,1796,2 -4857,permute_684,call_function,permute.default,backward,16,1,1,1,4352,1795,2 -4858,convert_element_type_1204,call_function,convert_element_type.default,backward,16,1,1,1,4353,1795,2 -4859,convert_element_type_1205,call_function,convert_element_type.default,backward,16,1,1,1,4353,1794,2 -4860,view_899,call_function,view.default,backward,16,1,1,1,4354,1794,2 -4861,view_as_complex_78,call_function,view_as_complex.default,backward,16,1,1,1,4355,1793,6 -4862,_conj_22,call_function,_conj.default,backward,16,1,1,1,4,1794,3 -4863,clone_94,call_function,clone.default,backward,16,1,1,1,5,1793,3 
-4864,mul_436,call_function,mul.Tensor,backward,16,2,2,1,4358,1792,8 -4865,view_900,call_function,view.default,backward,16,1,1,1,4354,1793,2 -4866,view_as_complex_79,call_function,view_as_complex.default,backward,16,1,1,1,4355,1792,6 -4867,_conj_23,call_function,_conj.default,backward,16,1,1,1,4,1793,3 -4868,clone_95,call_function,clone.default,backward,16,1,1,1,5,1792,3 -4869,mul_437,call_function,mul.Tensor,backward,16,2,2,1,4358,1791,8 -4870,view_as_real_78,call_function,view_as_real.default,backward,16,1,1,1,4359,1791,6 -4871,view_901,call_function,view.default,backward,16,1,1,1,4360,1790,6 -4872,convert_element_type_1206,call_function,convert_element_type.default,backward,16,1,1,1,4361,1789,6 -4873,view_as_real_79,call_function,view_as_real.default,backward,16,1,1,1,4359,1790,6 -4874,view_902,call_function,view.default,backward,16,1,1,1,4360,1789,6 -4875,convert_element_type_1207,call_function,convert_element_type.default,backward,16,1,1,1,4361,1788,6 -4876,view_903,call_function,view.default,backward,16,1,1,1,4353,1788,2 -4877,view_904,call_function,view.default,backward,16,1,1,1,4362,1788,5 -4878,view_905,call_function,view.default,backward,16,1,1,1,4362,1787,5 -4879,alias_default_983,call_function,alias.default,backward,16,1,1,2,4354,1787,4 -4880,einsum_default_361,call_function,einsum.default,backward,16,2,2,1,4355,3,5 -4881,permute_687,call_function,permute.default,backward,16,1,1,1,4,1783,3 -4882,einsum_default_362,call_function,einsum.default,backward,16,2,2,1,4356,1782,5 -4883,permute_688,call_function,permute.default,backward,16,1,1,1,4356,2,4 -4884,dtype_cast_361,call_function,dtype_cast.default,backward,16,1,1,1,4357,1,4 -4885,alias_default_1393,call_function,alias.default,backward,16,1,1,0,4358,0,3 -4886,alias_default_984,call_function,alias.default,backward,16,1,1,2,4363,1787,4 -4887,einsum_default_363,call_function,einsum.default,backward,16,2,2,1,4364,3,5 -4888,permute_691,call_function,permute.default,backward,16,1,1,1,4,1783,3 
-4889,einsum_default_364,call_function,einsum.default,backward,16,2,2,1,4365,1782,5 -4890,add_222,call_function,add.Tensor,unknown,,2,2,1,4372,1781,10 -4891,permute_692,call_function,permute.default,backward,16,1,1,1,4365,2,4 -4892,dtype_cast_362,call_function,dtype_cast.default,backward,16,1,1,1,4366,1,4 -4893,alias_default_1392,call_function,alias.default,backward,16,1,1,0,4367,0,3 -4894,alias_default_985,call_function,alias.default,backward,16,1,1,2,4363,1786,4 -4895,einsum_default_365,call_function,einsum.default,backward,16,2,2,1,4364,3,5 -4896,permute_695,call_function,permute.default,backward,16,1,1,1,4,1782,3 -4897,einsum_default_366,call_function,einsum.default,backward,16,2,2,1,4365,1781,5 -4898,add_223,call_function,add.Tensor,unknown,,2,2,1,4388,1780,10 -4899,permute_696,call_function,permute.default,backward,16,1,1,1,4365,2,4 -4900,dtype_cast_363,call_function,dtype_cast.default,backward,16,1,1,1,4366,1,4 -4901,alias_default_1391,call_function,alias.default,backward,16,1,1,0,4367,0,3 -4902,convert_element_type_1220,call_function,convert_element_type.default,backward,16,1,1,1,4389,1779,8 -4903,convert_element_type_1221,call_function,convert_element_type.default,backward,16,1,1,1,1769,1779,4 -4904,convert_element_type_1222,call_function,convert_element_type.default,backward,16,1,1,1,3,1773,2 -4905,alias_default_986,call_function,alias.default,backward,16,1,1,2,4390,1778,4 -4906,mul_438,call_function,mul.Tensor,backward,16,2,2,1,4392,1772,8 -4907,mul_439,call_function,mul.Tensor,backward,16,2,2,1,1777,1778,8 -4908,alias_default_987,call_function,alias.default,backward,16,1,1,2,4393,1771,4 -4909,alias_default_988,call_function,alias.default,backward,16,1,1,3,1778,1777,4 -4910,mul_440,call_function,mul.Tensor,backward,16,2,2,1,4397,1770,8 -4911,sum_49,call_function,sum.dim_IntList,backward,16,1,1,1,4398,1769,5 -4912,div_52,call_function,div.Tensor,backward,16,1,1,1,1779,1769,6 -4913,mul_441,call_function,mul.Tensor,backward,16,2,2,1,4400,1768,8 
-4914,sub_36,call_function,sub.Tensor,backward,16,2,2,1,4401,1767,10 -4915,mul_442,call_function,mul.Tensor,backward,16,2,2,1,4402,1766,8 -4916,mul_443,call_function,mul.Tensor,backward,16,2,2,1,4394,4,8 -4917,sum_50,call_function,sum.dim_IntList,backward,16,1,1,1,4395,3,5 -4918,convert_element_type_1223,call_function,convert_element_type.default,backward,16,1,1,1,4403,1765,6 -4919,convert_element_type_1224,call_function,convert_element_type.default,backward,16,1,1,1,4396,2,3 -4920,add_224,call_function,add.Tensor,unknown,,2,2,1,4404,1764,10 -4921,dtype_cast_364,call_function,dtype_cast.default,backward,16,1,1,1,4397,1,3 -4922,alias_default_1398,call_function,alias.default,backward,16,1,1,0,4398,0,2 -4923,alias_default_989,call_function,alias.default,unknown,,1,1,3,4405,1763,4 -4924,einsum_default_367,call_function,einsum.default,backward,15,2,2,1,4406,3,5 -4925,permute_699,call_function,permute.default,backward,15,1,1,1,4,1759,3 -4926,einsum_default_368,call_function,einsum.default,backward,15,2,2,1,4407,1758,5 -4927,permute_700,call_function,permute.default,backward,15,1,1,1,4407,2,4 -4928,dtype_cast_365,call_function,dtype_cast.default,backward,15,1,1,1,4408,1,4 -4929,alias_default_1387,call_function,alias.default,backward,15,1,1,0,4409,0,3 -4930,alias_default_990,call_function,alias.default,backward,15,1,1,2,4408,1757,4 -4931,mul_444,call_function,mul.Tensor,backward,15,2,2,1,4409,1745,8 -4932,mul_445,call_function,mul.Tensor,backward,15,2,2,1,4409,1749,8 -4933,alias_default_991,call_function,alias.default,backward,15,1,1,2,4410,1744,4 -4934,einsum_default_369,call_function,einsum.default,backward,15,2,2,1,4411,3,5 -4935,permute_703,call_function,permute.default,backward,15,1,1,1,4,1740,3 -4936,einsum_default_370,call_function,einsum.default,backward,15,2,2,1,4412,1739,5 -4937,permute_704,call_function,permute.default,backward,15,1,1,1,4412,2,4 -4938,dtype_cast_366,call_function,dtype_cast.default,backward,15,1,1,1,4413,1,4 
-4939,alias_default_1388,call_function,alias.default,backward,15,1,1,0,4414,0,3 -4940,convert_element_type_1233,call_function,convert_element_type.default,backward,15,1,1,1,4410,1748,6 -4941,convert_element_type_1234,call_function,convert_element_type.default,backward,15,1,1,1,1746,1758,4 -4942,alias_default_992,call_function,alias.default,backward,15,1,1,2,1747,1757,4 -4943,neg_40,call_function,neg.default,backward,15,1,1,1,1748,1756,8 -4944,exp_40,call_function,exp.default,backward,15,1,1,1,1749,1755,6 -4945,add_225,call_function,add.Tensor,backward,15,1,1,1,1750,1754,4 -4946,reciprocal_12,call_function,reciprocal.default,backward,15,1,1,1,1751,1753,4 -4947,mul_446,call_function,mul.Tensor,backward,15,1,1,1,1752,1752,6 -4948,alias_default_993,call_function,alias.default,backward,15,1,1,2,1753,1751,4 -4949,mul_447,call_function,mul.Tensor,backward,15,2,2,1,4419,1747,8 -4950,sub_37,call_function,sub.Tensor,backward,15,1,1,1,1754,1749,4 -4951,mul_448,call_function,mul.Tensor,backward,15,2,2,1,1755,1748,8 -4952,add_226,call_function,add.Tensor,backward,15,1,1,1,1756,1747,4 -4953,mul_449,call_function,mul.Tensor,backward,15,2,2,1,4423,1746,8 -4954,convert_element_type_1235,call_function,convert_element_type.default,backward,15,1,1,1,4424,1745,6 -4955,alias_default_994,call_function,alias.default,backward,15,1,1,2,4425,1744,4 -4956,einsum_default_371,call_function,einsum.default,backward,15,2,2,1,4426,3,5 -4957,permute_707,call_function,permute.default,backward,15,1,1,1,4,1740,3 -4958,einsum_default_372,call_function,einsum.default,backward,15,2,2,1,4427,1739,5 -4959,add_227,call_function,add.Tensor,unknown,,2,2,1,4432,1738,10 -4960,permute_708,call_function,permute.default,backward,15,1,1,1,4427,2,4 -4961,dtype_cast_367,call_function,dtype_cast.default,backward,15,1,1,1,4428,1,4 -4962,alias_default_1386,call_function,alias.default,backward,15,1,1,0,4429,0,3 -4963,convert_element_type_1240,call_function,convert_element_type.default,backward,15,1,1,1,4433,1737,8 
-4964,convert_element_type_1241,call_function,convert_element_type.default,backward,15,1,1,1,1726,1737,4 -4965,convert_element_type_1242,call_function,convert_element_type.default,backward,15,1,1,1,3,1731,2 -4966,alias_default_995,call_function,alias.default,backward,15,1,1,2,4434,1736,4 -4967,mul_450,call_function,mul.Tensor,backward,15,2,2,1,4436,1730,8 -4968,mul_451,call_function,mul.Tensor,backward,15,2,2,1,1734,1736,8 -4969,alias_default_996,call_function,alias.default,backward,15,1,1,2,4437,1729,4 -4970,alias_default_997,call_function,alias.default,backward,15,1,1,3,1735,1735,4 -4971,mul_452,call_function,mul.Tensor,backward,15,2,2,1,4441,1728,8 -4972,sum_51,call_function,sum.dim_IntList,backward,15,1,1,1,4442,1727,5 -4973,div_53,call_function,div.Tensor,backward,15,1,1,1,1736,1727,6 -4974,mul_453,call_function,mul.Tensor,backward,15,2,2,1,4444,1726,8 -4975,sub_38,call_function,sub.Tensor,backward,15,2,2,1,4445,1725,10 -4976,mul_454,call_function,mul.Tensor,backward,15,2,2,1,4446,1724,8 -4977,mul_455,call_function,mul.Tensor,backward,15,2,2,1,4438,4,8 -4978,sum_52,call_function,sum.dim_IntList,backward,15,1,1,1,4439,3,5 -4979,convert_element_type_1243,call_function,convert_element_type.default,backward,15,1,1,1,4447,1723,6 -4980,convert_element_type_1244,call_function,convert_element_type.default,backward,15,1,1,1,4440,2,3 -4981,add_228,call_function,add.Tensor,unknown,,2,2,1,4448,1722,10 -4982,dtype_cast_368,call_function,dtype_cast.default,backward,15,1,1,1,4441,1,3 -4983,alias_default_1390,call_function,alias.default,backward,15,1,1,0,4442,0,2 -4984,alias_default_998,call_function,alias.default,unknown,,1,1,3,4449,1721,4 -4985,einsum_default_373,call_function,einsum.default,backward,15,2,2,1,4450,3,5 -4986,permute_711,call_function,permute.default,backward,15,1,1,1,4,1717,3 -4987,einsum_default_374,call_function,einsum.default,backward,15,2,2,1,4451,1716,5 -4988,permute_712,call_function,permute.default,backward,15,1,1,1,4451,2,4 
-4989,dtype_cast_369,call_function,dtype_cast.default,backward,15,1,1,1,4452,1,4 -4990,alias_default_1385,call_function,alias.default,backward,15,1,1,0,4453,0,3 -4991,view_920,call_function,view.default,backward,15,1,1,1,4452,1715,4 -4992,permute_713,call_function,permute.default,backward,15,1,1,1,4453,1714,4 -4993,_scaled_dot_product_flash_attention_backward_12,call_function,_scaled_dot_product_flash_attention_backward.default,backward,15,8,8,3,4457,1713,2 -4994,getitem_288,call_function,getitem,backward,15,1,1,1,4458,1686,2 -4995,getitem_289,call_function,getitem,backward,15,1,1,1,4458,1687,2 -4996,getitem_290,call_function,getitem,backward,15,1,1,1,4458,1680,2 -4997,permute_714,call_function,permute.default,backward,15,1,1,1,4459,1679,2 -4998,permute_715,call_function,permute.default,backward,15,1,1,1,4459,1686,2 -4999,permute_716,call_function,permute.default,backward,15,1,1,1,4459,1685,2 -5000,convert_element_type_1249,call_function,convert_element_type.default,backward,15,1,1,1,4460,1685,2 -5001,convert_element_type_1250,call_function,convert_element_type.default,backward,15,1,1,1,4460,1684,2 -5002,view_921,call_function,view.default,backward,15,1,1,1,4461,1684,2 -5003,view_as_complex_80,call_function,view_as_complex.default,backward,15,1,1,1,4462,1683,6 -5004,_conj_24,call_function,_conj.default,backward,15,1,1,1,4,1684,3 -5005,clone_102,call_function,clone.default,backward,15,1,1,1,5,1683,3 -5006,mul_456,call_function,mul.Tensor,backward,15,2,2,1,4465,1682,8 -5007,view_922,call_function,view.default,backward,15,1,1,1,4461,1683,2 -5008,view_as_complex_81,call_function,view_as_complex.default,backward,15,1,1,1,4462,1682,6 -5009,_conj_25,call_function,_conj.default,backward,15,1,1,1,4,1683,3 -5010,clone_103,call_function,clone.default,backward,15,1,1,1,5,1682,3 -5011,mul_457,call_function,mul.Tensor,backward,15,2,2,1,4465,1681,8 -5012,view_as_real_80,call_function,view_as_real.default,backward,15,1,1,1,4466,1681,6 
-5013,view_923,call_function,view.default,backward,15,1,1,1,4467,1680,6 -5014,convert_element_type_1251,call_function,convert_element_type.default,backward,15,1,1,1,4468,1679,6 -5015,view_as_real_81,call_function,view_as_real.default,backward,15,1,1,1,4466,1680,6 -5016,view_924,call_function,view.default,backward,15,1,1,1,4467,1679,6 -5017,convert_element_type_1252,call_function,convert_element_type.default,backward,15,1,1,1,4468,1678,6 -5018,view_925,call_function,view.default,backward,15,1,1,1,4460,1678,2 -5019,view_926,call_function,view.default,backward,15,1,1,1,4469,1678,5 -5020,view_927,call_function,view.default,backward,15,1,1,1,4469,1677,5 -5021,alias_default_999,call_function,alias.default,backward,15,1,1,2,4461,1677,4 -5022,einsum_default_375,call_function,einsum.default,backward,15,2,2,1,4462,3,5 -5023,permute_719,call_function,permute.default,backward,15,1,1,1,4,1673,3 -5024,einsum_default_376,call_function,einsum.default,backward,15,2,2,1,4463,1672,5 -5025,permute_720,call_function,permute.default,backward,15,1,1,1,4463,2,4 -5026,dtype_cast_370,call_function,dtype_cast.default,backward,15,1,1,1,4464,1,4 -5027,alias_default_1384,call_function,alias.default,backward,15,1,1,0,4465,0,3 -5028,alias_default_1000,call_function,alias.default,backward,15,1,1,2,4470,1677,4 -5029,einsum_default_377,call_function,einsum.default,backward,15,2,2,1,4471,3,5 -5030,permute_723,call_function,permute.default,backward,15,1,1,1,4,1673,3 -5031,einsum_default_378,call_function,einsum.default,backward,15,2,2,1,4472,1672,5 -5032,add_229,call_function,add.Tensor,unknown,,2,2,1,4479,1671,10 -5033,permute_724,call_function,permute.default,backward,15,1,1,1,4472,2,4 -5034,dtype_cast_371,call_function,dtype_cast.default,backward,15,1,1,1,4473,1,4 -5035,alias_default_1383,call_function,alias.default,backward,15,1,1,0,4474,0,3 -5036,alias_default_1001,call_function,alias.default,backward,15,1,1,2,4470,1676,4 
-5037,einsum_default_379,call_function,einsum.default,backward,15,2,2,1,4471,3,5 -5038,permute_727,call_function,permute.default,backward,15,1,1,1,4,1672,3 -5039,einsum_default_380,call_function,einsum.default,backward,15,2,2,1,4472,1671,5 -5040,add_230,call_function,add.Tensor,unknown,,2,2,1,4495,1670,10 -5041,permute_728,call_function,permute.default,backward,15,1,1,1,4472,2,4 -5042,dtype_cast_372,call_function,dtype_cast.default,backward,15,1,1,1,4473,1,4 -5043,alias_default_1382,call_function,alias.default,backward,15,1,1,0,4474,0,3 -5044,convert_element_type_1265,call_function,convert_element_type.default,backward,15,1,1,1,4496,1669,8 -5045,convert_element_type_1266,call_function,convert_element_type.default,backward,15,1,1,1,1659,1669,4 -5046,convert_element_type_1267,call_function,convert_element_type.default,backward,15,1,1,1,3,1663,2 -5047,alias_default_1002,call_function,alias.default,backward,15,1,1,2,4497,1668,4 -5048,mul_458,call_function,mul.Tensor,backward,15,2,2,1,4499,1662,8 -5049,mul_459,call_function,mul.Tensor,backward,15,2,2,1,1667,1668,8 -5050,alias_default_1003,call_function,alias.default,backward,15,1,1,2,4500,1661,4 -5051,alias_default_1004,call_function,alias.default,backward,15,1,1,3,1668,1667,4 -5052,mul_460,call_function,mul.Tensor,backward,15,2,2,1,4504,1660,8 -5053,sum_53,call_function,sum.dim_IntList,backward,15,1,1,1,4505,1659,5 -5054,div_54,call_function,div.Tensor,backward,15,1,1,1,1669,1659,6 -5055,mul_461,call_function,mul.Tensor,backward,15,2,2,1,4507,1658,8 -5056,sub_39,call_function,sub.Tensor,backward,15,2,2,1,4508,1657,10 -5057,mul_462,call_function,mul.Tensor,backward,15,2,2,1,4509,1656,8 -5058,mul_463,call_function,mul.Tensor,backward,15,2,2,1,4501,4,8 -5059,sum_54,call_function,sum.dim_IntList,backward,15,1,1,1,4502,3,5 -5060,convert_element_type_1268,call_function,convert_element_type.default,backward,15,1,1,1,4510,1655,6 
-5061,convert_element_type_1269,call_function,convert_element_type.default,backward,15,1,1,1,4503,2,3 -5062,add_231,call_function,add.Tensor,unknown,,2,2,1,4511,1654,10 -5063,dtype_cast_373,call_function,dtype_cast.default,backward,15,1,1,1,4504,1,3 -5064,alias_default_1389,call_function,alias.default,backward,15,1,1,0,4505,0,2 -5065,alias_default_1005,call_function,alias.default,unknown,,1,1,3,4512,1653,4 -5066,einsum_default_381,call_function,einsum.default,backward,14,2,2,1,4513,3,5 -5067,permute_731,call_function,permute.default,backward,14,1,1,1,4,1649,3 -5068,einsum_default_382,call_function,einsum.default,backward,14,2,2,1,4514,1648,5 -5069,permute_732,call_function,permute.default,backward,14,1,1,1,4514,2,4 -5070,dtype_cast_374,call_function,dtype_cast.default,backward,14,1,1,1,4515,1,4 -5071,alias_default_1378,call_function,alias.default,backward,14,1,1,0,4516,0,3 -5072,alias_default_1006,call_function,alias.default,backward,14,1,1,2,4515,1647,4 -5073,mul_464,call_function,mul.Tensor,backward,14,2,2,1,4516,1635,8 -5074,mul_465,call_function,mul.Tensor,backward,14,2,2,1,4516,1639,8 -5075,alias_default_1007,call_function,alias.default,backward,14,1,1,2,4517,1634,4 -5076,einsum_default_383,call_function,einsum.default,backward,14,2,2,1,4518,3,5 -5077,permute_735,call_function,permute.default,backward,14,1,1,1,4,1630,3 -5078,einsum_default_384,call_function,einsum.default,backward,14,2,2,1,4519,1629,5 -5079,permute_736,call_function,permute.default,backward,14,1,1,1,4519,2,4 -5080,dtype_cast_375,call_function,dtype_cast.default,backward,14,1,1,1,4520,1,4 -5081,alias_default_1379,call_function,alias.default,backward,14,1,1,0,4521,0,3 -5082,convert_element_type_1278,call_function,convert_element_type.default,backward,14,1,1,1,4517,1638,6 -5083,convert_element_type_1279,call_function,convert_element_type.default,backward,14,1,1,1,1636,1648,4 -5084,alias_default_1008,call_function,alias.default,backward,14,1,1,2,1637,1647,4 
-5085,neg_41,call_function,neg.default,backward,14,1,1,1,1638,1646,8 -5086,exp_41,call_function,exp.default,backward,14,1,1,1,1639,1645,6 -5087,add_232,call_function,add.Tensor,backward,14,1,1,1,1640,1644,4 -5088,reciprocal_13,call_function,reciprocal.default,backward,14,1,1,1,1641,1643,4 -5089,mul_466,call_function,mul.Tensor,backward,14,1,1,1,1642,1642,6 -5090,alias_default_1009,call_function,alias.default,backward,14,1,1,2,1643,1641,4 -5091,mul_467,call_function,mul.Tensor,backward,14,2,2,1,4526,1637,8 -5092,sub_40,call_function,sub.Tensor,backward,14,1,1,1,1644,1639,4 -5093,mul_468,call_function,mul.Tensor,backward,14,2,2,1,1645,1638,8 -5094,add_233,call_function,add.Tensor,backward,14,1,1,1,1646,1637,4 -5095,mul_469,call_function,mul.Tensor,backward,14,2,2,1,4530,1636,8 -5096,convert_element_type_1280,call_function,convert_element_type.default,backward,14,1,1,1,4531,1635,6 -5097,alias_default_1010,call_function,alias.default,backward,14,1,1,2,4532,1634,4 -5098,einsum_default_385,call_function,einsum.default,backward,14,2,2,1,4533,3,5 -5099,permute_739,call_function,permute.default,backward,14,1,1,1,4,1630,3 -5100,einsum_default_386,call_function,einsum.default,backward,14,2,2,1,4534,1629,5 -5101,add_234,call_function,add.Tensor,unknown,,2,2,1,4539,1628,10 -5102,permute_740,call_function,permute.default,backward,14,1,1,1,4534,2,4 -5103,dtype_cast_376,call_function,dtype_cast.default,backward,14,1,1,1,4535,1,4 -5104,alias_default_1377,call_function,alias.default,backward,14,1,1,0,4536,0,3 -5105,convert_element_type_1285,call_function,convert_element_type.default,backward,14,1,1,1,4540,1627,8 -5106,convert_element_type_1286,call_function,convert_element_type.default,backward,14,1,1,1,1616,1627,4 -5107,convert_element_type_1287,call_function,convert_element_type.default,backward,14,1,1,1,3,1621,2 -5108,alias_default_1011,call_function,alias.default,backward,14,1,1,2,4541,1626,4 -5109,mul_470,call_function,mul.Tensor,backward,14,2,2,1,4543,1620,8 
-5110,mul_471,call_function,mul.Tensor,backward,14,2,2,1,1624,1626,8 -5111,alias_default_1012,call_function,alias.default,backward,14,1,1,2,4544,1619,4 -5112,alias_default_1013,call_function,alias.default,backward,14,1,1,3,1625,1625,4 -5113,mul_472,call_function,mul.Tensor,backward,14,2,2,1,4548,1618,8 -5114,sum_55,call_function,sum.dim_IntList,backward,14,1,1,1,4549,1617,5 -5115,div_55,call_function,div.Tensor,backward,14,1,1,1,1626,1617,6 -5116,mul_473,call_function,mul.Tensor,backward,14,2,2,1,4551,1616,8 -5117,sub_41,call_function,sub.Tensor,backward,14,2,2,1,4552,1615,10 -5118,mul_474,call_function,mul.Tensor,backward,14,2,2,1,4553,1614,8 -5119,mul_475,call_function,mul.Tensor,backward,14,2,2,1,4545,4,8 -5120,sum_56,call_function,sum.dim_IntList,backward,14,1,1,1,4546,3,5 -5121,convert_element_type_1288,call_function,convert_element_type.default,backward,14,1,1,1,4554,1613,6 -5122,convert_element_type_1289,call_function,convert_element_type.default,backward,14,1,1,1,4547,2,3 -5123,add_235,call_function,add.Tensor,unknown,,2,2,1,4555,1612,10 -5124,dtype_cast_377,call_function,dtype_cast.default,backward,14,1,1,1,4548,1,3 -5125,alias_default_1381,call_function,alias.default,backward,14,1,1,0,4549,0,2 -5126,alias_default_1014,call_function,alias.default,unknown,,1,1,3,4556,1611,4 -5127,einsum_default_387,call_function,einsum.default,backward,14,2,2,1,4557,3,5 -5128,permute_743,call_function,permute.default,backward,14,1,1,1,4,1607,3 -5129,einsum_default_388,call_function,einsum.default,backward,14,2,2,1,4558,1606,5 -5130,permute_744,call_function,permute.default,backward,14,1,1,1,4558,2,4 -5131,dtype_cast_378,call_function,dtype_cast.default,backward,14,1,1,1,4559,1,4 -5132,alias_default_1376,call_function,alias.default,backward,14,1,1,0,4560,0,3 -5133,view_942,call_function,view.default,backward,14,1,1,1,4559,1605,4 -5134,permute_745,call_function,permute.default,backward,14,1,1,1,4560,1604,4 
-5135,_scaled_dot_product_flash_attention_backward_13,call_function,_scaled_dot_product_flash_attention_backward.default,backward,14,8,8,3,4564,1603,2 -5136,getitem_291,call_function,getitem,backward,14,1,1,1,4565,1576,2 -5137,getitem_292,call_function,getitem,backward,14,1,1,1,4565,1577,2 -5138,getitem_293,call_function,getitem,backward,14,1,1,1,4565,1570,2 -5139,permute_746,call_function,permute.default,backward,14,1,1,1,4566,1569,2 -5140,permute_747,call_function,permute.default,backward,14,1,1,1,4566,1576,2 -5141,permute_748,call_function,permute.default,backward,14,1,1,1,4566,1575,2 -5142,convert_element_type_1294,call_function,convert_element_type.default,backward,14,1,1,1,4567,1575,2 -5143,convert_element_type_1295,call_function,convert_element_type.default,backward,14,1,1,1,4567,1574,2 -5144,view_943,call_function,view.default,backward,14,1,1,1,4568,1574,2 -5145,view_as_complex_82,call_function,view_as_complex.default,backward,14,1,1,1,4569,1573,6 -5146,_conj_26,call_function,_conj.default,backward,14,1,1,1,4,1574,3 -5147,clone_110,call_function,clone.default,backward,14,1,1,1,5,1573,3 -5148,mul_476,call_function,mul.Tensor,backward,14,2,2,1,4572,1572,8 -5149,view_944,call_function,view.default,backward,14,1,1,1,4568,1573,2 -5150,view_as_complex_83,call_function,view_as_complex.default,backward,14,1,1,1,4569,1572,6 -5151,_conj_27,call_function,_conj.default,backward,14,1,1,1,4,1573,3 -5152,clone_111,call_function,clone.default,backward,14,1,1,1,5,1572,3 -5153,mul_477,call_function,mul.Tensor,backward,14,2,2,1,4572,1571,8 -5154,view_as_real_82,call_function,view_as_real.default,backward,14,1,1,1,4573,1571,6 -5155,view_945,call_function,view.default,backward,14,1,1,1,4574,1570,6 -5156,convert_element_type_1296,call_function,convert_element_type.default,backward,14,1,1,1,4575,1569,6 -5157,view_as_real_83,call_function,view_as_real.default,backward,14,1,1,1,4573,1570,6 -5158,view_946,call_function,view.default,backward,14,1,1,1,4574,1569,6 
-5159,convert_element_type_1297,call_function,convert_element_type.default,backward,14,1,1,1,4575,1568,6 -5160,view_947,call_function,view.default,backward,14,1,1,1,4567,1568,2 -5161,view_948,call_function,view.default,backward,14,1,1,1,4576,1568,5 -5162,view_949,call_function,view.default,backward,14,1,1,1,4576,1567,5 -5163,alias_default_1015,call_function,alias.default,backward,14,1,1,2,4568,1567,4 -5164,einsum_default_389,call_function,einsum.default,backward,14,2,2,1,4569,3,5 -5165,permute_751,call_function,permute.default,backward,14,1,1,1,4,1563,3 -5166,einsum_default_390,call_function,einsum.default,backward,14,2,2,1,4570,1562,5 -5167,permute_752,call_function,permute.default,backward,14,1,1,1,4570,2,4 -5168,dtype_cast_379,call_function,dtype_cast.default,backward,14,1,1,1,4571,1,4 -5169,alias_default_1375,call_function,alias.default,backward,14,1,1,0,4572,0,3 -5170,alias_default_1016,call_function,alias.default,backward,14,1,1,2,4577,1567,4 -5171,einsum_default_391,call_function,einsum.default,backward,14,2,2,1,4578,3,5 -5172,permute_755,call_function,permute.default,backward,14,1,1,1,4,1563,3 -5173,einsum_default_392,call_function,einsum.default,backward,14,2,2,1,4579,1562,5 -5174,add_236,call_function,add.Tensor,unknown,,2,2,1,4586,1561,10 -5175,permute_756,call_function,permute.default,backward,14,1,1,1,4579,2,4 -5176,dtype_cast_380,call_function,dtype_cast.default,backward,14,1,1,1,4580,1,4 -5177,alias_default_1374,call_function,alias.default,backward,14,1,1,0,4581,0,3 -5178,alias_default_1017,call_function,alias.default,backward,14,1,1,2,4577,1566,4 -5179,einsum_default_393,call_function,einsum.default,backward,14,2,2,1,4578,3,5 -5180,permute_759,call_function,permute.default,backward,14,1,1,1,4,1562,3 -5181,einsum_default_394,call_function,einsum.default,backward,14,2,2,1,4579,1561,5 -5182,add_237,call_function,add.Tensor,unknown,,2,2,1,4602,1560,10 -5183,permute_760,call_function,permute.default,backward,14,1,1,1,4579,2,4 
-5184,dtype_cast_381,call_function,dtype_cast.default,backward,14,1,1,1,4580,1,4 -5185,alias_default_1373,call_function,alias.default,backward,14,1,1,0,4581,0,3 -5186,convert_element_type_1310,call_function,convert_element_type.default,backward,14,1,1,1,4603,1559,8 -5187,convert_element_type_1311,call_function,convert_element_type.default,backward,14,1,1,1,1549,1559,4 -5188,convert_element_type_1312,call_function,convert_element_type.default,backward,14,1,1,1,3,1553,2 -5189,alias_default_1018,call_function,alias.default,backward,14,1,1,2,4604,1558,4 -5190,mul_478,call_function,mul.Tensor,backward,14,2,2,1,4606,1552,8 -5191,mul_479,call_function,mul.Tensor,backward,14,2,2,1,1557,1558,8 -5192,alias_default_1019,call_function,alias.default,backward,14,1,1,2,4607,1551,4 -5193,alias_default_1020,call_function,alias.default,backward,14,1,1,3,1558,1557,4 -5194,mul_480,call_function,mul.Tensor,backward,14,2,2,1,4611,1550,8 -5195,sum_57,call_function,sum.dim_IntList,backward,14,1,1,1,4612,1549,5 -5196,div_56,call_function,div.Tensor,backward,14,1,1,1,1559,1549,6 -5197,mul_481,call_function,mul.Tensor,backward,14,2,2,1,4614,1548,8 -5198,sub_42,call_function,sub.Tensor,backward,14,2,2,1,4615,1547,10 -5199,mul_482,call_function,mul.Tensor,backward,14,2,2,1,4616,1546,8 -5200,mul_483,call_function,mul.Tensor,backward,14,2,2,1,4608,4,8 -5201,sum_58,call_function,sum.dim_IntList,backward,14,1,1,1,4609,3,5 -5202,convert_element_type_1313,call_function,convert_element_type.default,backward,14,1,1,1,4617,1545,6 -5203,convert_element_type_1314,call_function,convert_element_type.default,backward,14,1,1,1,4610,2,3 -5204,add_238,call_function,add.Tensor,unknown,,2,2,1,4618,1544,10 -5205,dtype_cast_382,call_function,dtype_cast.default,backward,14,1,1,1,4611,1,3 -5206,alias_default_1380,call_function,alias.default,backward,14,1,1,0,4612,0,2 -5207,alias_default_1021,call_function,alias.default,unknown,,1,1,3,4619,1543,4 
-5208,einsum_default_395,call_function,einsum.default,backward,13,2,2,1,4620,3,5 -5209,permute_763,call_function,permute.default,backward,13,1,1,1,4,1539,3 -5210,einsum_default_396,call_function,einsum.default,backward,13,2,2,1,4621,1538,5 -5211,permute_764,call_function,permute.default,backward,13,1,1,1,4621,2,4 -5212,dtype_cast_383,call_function,dtype_cast.default,backward,13,1,1,1,4622,1,4 -5213,alias_default_1369,call_function,alias.default,backward,13,1,1,0,4623,0,3 -5214,alias_default_1022,call_function,alias.default,backward,13,1,1,2,4622,1537,4 -5215,mul_484,call_function,mul.Tensor,backward,13,2,2,1,4623,1525,8 -5216,mul_485,call_function,mul.Tensor,backward,13,2,2,1,4623,1529,8 -5217,alias_default_1023,call_function,alias.default,backward,13,1,1,2,4624,1524,4 -5218,einsum_default_397,call_function,einsum.default,backward,13,2,2,1,4625,3,5 -5219,permute_767,call_function,permute.default,backward,13,1,1,1,4,1520,3 -5220,einsum_default_398,call_function,einsum.default,backward,13,2,2,1,4626,1519,5 -5221,permute_768,call_function,permute.default,backward,13,1,1,1,4626,2,4 -5222,dtype_cast_384,call_function,dtype_cast.default,backward,13,1,1,1,4627,1,4 -5223,alias_default_1370,call_function,alias.default,backward,13,1,1,0,4628,0,3 -5224,convert_element_type_1323,call_function,convert_element_type.default,backward,13,1,1,1,4624,1528,6 -5225,convert_element_type_1324,call_function,convert_element_type.default,backward,13,1,1,1,1526,1538,4 -5226,alias_default_1024,call_function,alias.default,backward,13,1,1,2,1527,1537,4 -5227,neg_42,call_function,neg.default,backward,13,1,1,1,1528,1536,8 -5228,exp_42,call_function,exp.default,backward,13,1,1,1,1529,1535,6 -5229,add_239,call_function,add.Tensor,backward,13,1,1,1,1530,1534,4 -5230,reciprocal_14,call_function,reciprocal.default,backward,13,1,1,1,1531,1533,4 -5231,mul_486,call_function,mul.Tensor,backward,13,1,1,1,1532,1532,6 -5232,alias_default_1025,call_function,alias.default,backward,13,1,1,2,1533,1531,4 
-5233,mul_487,call_function,mul.Tensor,backward,13,2,2,1,4633,1527,8 -5234,sub_43,call_function,sub.Tensor,backward,13,1,1,1,1534,1529,4 -5235,mul_488,call_function,mul.Tensor,backward,13,2,2,1,1535,1528,8 -5236,add_240,call_function,add.Tensor,backward,13,1,1,1,1536,1527,4 -5237,mul_489,call_function,mul.Tensor,backward,13,2,2,1,4637,1526,8 -5238,convert_element_type_1325,call_function,convert_element_type.default,backward,13,1,1,1,4638,1525,6 -5239,alias_default_1026,call_function,alias.default,backward,13,1,1,2,4639,1524,4 -5240,einsum_default_399,call_function,einsum.default,backward,13,2,2,1,4640,3,5 -5241,permute_771,call_function,permute.default,backward,13,1,1,1,4,1520,3 -5242,einsum_default_400,call_function,einsum.default,backward,13,2,2,1,4641,1519,5 -5243,add_241,call_function,add.Tensor,unknown,,2,2,1,4646,1518,10 -5244,permute_772,call_function,permute.default,backward,13,1,1,1,4641,2,4 -5245,dtype_cast_385,call_function,dtype_cast.default,backward,13,1,1,1,4642,1,4 -5246,alias_default_1368,call_function,alias.default,backward,13,1,1,0,4643,0,3 -5247,convert_element_type_1330,call_function,convert_element_type.default,backward,13,1,1,1,4647,1517,8 -5248,convert_element_type_1331,call_function,convert_element_type.default,backward,13,1,1,1,1506,1517,4 -5249,convert_element_type_1332,call_function,convert_element_type.default,backward,13,1,1,1,3,1511,2 -5250,alias_default_1027,call_function,alias.default,backward,13,1,1,2,4648,1516,4 -5251,mul_490,call_function,mul.Tensor,backward,13,2,2,1,4650,1510,8 -5252,mul_491,call_function,mul.Tensor,backward,13,2,2,1,1514,1516,8 -5253,alias_default_1028,call_function,alias.default,backward,13,1,1,2,4651,1509,4 -5254,alias_default_1029,call_function,alias.default,backward,13,1,1,3,1515,1515,4 -5255,mul_492,call_function,mul.Tensor,backward,13,2,2,1,4655,1508,8 -5256,sum_59,call_function,sum.dim_IntList,backward,13,1,1,1,4656,1507,5 -5257,div_57,call_function,div.Tensor,backward,13,1,1,1,1516,1507,6 
-5258,mul_493,call_function,mul.Tensor,backward,13,2,2,1,4658,1506,8 -5259,sub_44,call_function,sub.Tensor,backward,13,2,2,1,4659,1505,10 -5260,mul_494,call_function,mul.Tensor,backward,13,2,2,1,4660,1504,8 -5261,mul_495,call_function,mul.Tensor,backward,13,2,2,1,4652,4,8 -5262,sum_60,call_function,sum.dim_IntList,backward,13,1,1,1,4653,3,5 -5263,convert_element_type_1333,call_function,convert_element_type.default,backward,13,1,1,1,4661,1503,6 -5264,convert_element_type_1334,call_function,convert_element_type.default,backward,13,1,1,1,4654,2,3 -5265,add_242,call_function,add.Tensor,unknown,,2,2,1,4662,1502,10 -5266,dtype_cast_386,call_function,dtype_cast.default,backward,13,1,1,1,4655,1,3 -5267,alias_default_1372,call_function,alias.default,backward,13,1,1,0,4656,0,2 -5268,alias_default_1030,call_function,alias.default,unknown,,1,1,3,4663,1501,4 -5269,einsum_default_401,call_function,einsum.default,backward,13,2,2,1,4664,3,5 -5270,permute_775,call_function,permute.default,backward,13,1,1,1,4,1497,3 -5271,einsum_default_402,call_function,einsum.default,backward,13,2,2,1,4665,1496,5 -5272,permute_776,call_function,permute.default,backward,13,1,1,1,4665,2,4 -5273,dtype_cast_387,call_function,dtype_cast.default,backward,13,1,1,1,4666,1,4 -5274,alias_default_1367,call_function,alias.default,backward,13,1,1,0,4667,0,3 -5275,view_964,call_function,view.default,backward,13,1,1,1,4666,1495,4 -5276,permute_777,call_function,permute.default,backward,13,1,1,1,4667,1494,4 -5277,_scaled_dot_product_flash_attention_backward_14,call_function,_scaled_dot_product_flash_attention_backward.default,backward,13,8,8,3,4671,1493,2 -5278,getitem_294,call_function,getitem,backward,13,1,1,1,4672,1466,2 -5279,getitem_295,call_function,getitem,backward,13,1,1,1,4672,1467,2 -5280,getitem_296,call_function,getitem,backward,13,1,1,1,4672,1460,2 -5281,permute_778,call_function,permute.default,backward,13,1,1,1,4673,1459,2 
-5282,permute_779,call_function,permute.default,backward,13,1,1,1,4673,1466,2 -5283,permute_780,call_function,permute.default,backward,13,1,1,1,4673,1465,2 -5284,convert_element_type_1339,call_function,convert_element_type.default,backward,13,1,1,1,4674,1465,2 -5285,convert_element_type_1340,call_function,convert_element_type.default,backward,13,1,1,1,4674,1464,2 -5286,view_965,call_function,view.default,backward,13,1,1,1,4675,1464,2 -5287,view_as_complex_84,call_function,view_as_complex.default,backward,13,1,1,1,4676,1463,6 -5288,_conj_28,call_function,_conj.default,backward,13,1,1,1,4,1464,3 -5289,clone_118,call_function,clone.default,backward,13,1,1,1,5,1463,3 -5290,mul_496,call_function,mul.Tensor,backward,13,2,2,1,4679,1462,8 -5291,view_966,call_function,view.default,backward,13,1,1,1,4675,1463,2 -5292,view_as_complex_85,call_function,view_as_complex.default,backward,13,1,1,1,4676,1462,6 -5293,_conj_29,call_function,_conj.default,backward,13,1,1,1,4,1463,3 -5294,clone_119,call_function,clone.default,backward,13,1,1,1,5,1462,3 -5295,mul_497,call_function,mul.Tensor,backward,13,2,2,1,4679,1461,8 -5296,view_as_real_84,call_function,view_as_real.default,backward,13,1,1,1,4680,1461,6 -5297,view_967,call_function,view.default,backward,13,1,1,1,4681,1460,6 -5298,convert_element_type_1341,call_function,convert_element_type.default,backward,13,1,1,1,4682,1459,6 -5299,view_as_real_85,call_function,view_as_real.default,backward,13,1,1,1,4680,1460,6 -5300,view_968,call_function,view.default,backward,13,1,1,1,4681,1459,6 -5301,convert_element_type_1342,call_function,convert_element_type.default,backward,13,1,1,1,4682,1458,6 -5302,view_969,call_function,view.default,backward,13,1,1,1,4674,1458,2 -5303,view_970,call_function,view.default,backward,13,1,1,1,4683,1458,5 -5304,view_971,call_function,view.default,backward,13,1,1,1,4683,1457,5 -5305,alias_default_1031,call_function,alias.default,backward,13,1,1,2,4675,1457,4 
-5306,einsum_default_403,call_function,einsum.default,backward,13,2,2,1,4676,3,5 -5307,permute_783,call_function,permute.default,backward,13,1,1,1,4,1453,3 -5308,einsum_default_404,call_function,einsum.default,backward,13,2,2,1,4677,1452,5 -5309,permute_784,call_function,permute.default,backward,13,1,1,1,4677,2,4 -5310,dtype_cast_388,call_function,dtype_cast.default,backward,13,1,1,1,4678,1,4 -5311,alias_default_1366,call_function,alias.default,backward,13,1,1,0,4679,0,3 -5312,alias_default_1032,call_function,alias.default,backward,13,1,1,2,4684,1457,4 -5313,einsum_default_405,call_function,einsum.default,backward,13,2,2,1,4685,3,5 -5314,permute_787,call_function,permute.default,backward,13,1,1,1,4,1453,3 -5315,einsum_default_406,call_function,einsum.default,backward,13,2,2,1,4686,1452,5 -5316,add_243,call_function,add.Tensor,unknown,,2,2,1,4693,1451,10 -5317,permute_788,call_function,permute.default,backward,13,1,1,1,4686,2,4 -5318,dtype_cast_389,call_function,dtype_cast.default,backward,13,1,1,1,4687,1,4 -5319,alias_default_1365,call_function,alias.default,backward,13,1,1,0,4688,0,3 -5320,alias_default_1033,call_function,alias.default,backward,13,1,1,2,4684,1456,4 -5321,einsum_default_407,call_function,einsum.default,backward,13,2,2,1,4685,3,5 -5322,permute_791,call_function,permute.default,backward,13,1,1,1,4,1452,3 -5323,einsum_default_408,call_function,einsum.default,backward,13,2,2,1,4686,1451,5 -5324,add_244,call_function,add.Tensor,unknown,,2,2,1,4709,1450,10 -5325,permute_792,call_function,permute.default,backward,13,1,1,1,4686,2,4 -5326,dtype_cast_390,call_function,dtype_cast.default,backward,13,1,1,1,4687,1,4 -5327,alias_default_1364,call_function,alias.default,backward,13,1,1,0,4688,0,3 -5328,convert_element_type_1355,call_function,convert_element_type.default,backward,13,1,1,1,4710,1449,8 -5329,convert_element_type_1356,call_function,convert_element_type.default,backward,13,1,1,1,1439,1449,4 
-5330,convert_element_type_1357,call_function,convert_element_type.default,backward,13,1,1,1,3,1443,2 -5331,alias_default_1034,call_function,alias.default,backward,13,1,1,2,4711,1448,4 -5332,mul_498,call_function,mul.Tensor,backward,13,2,2,1,4713,1442,8 -5333,mul_499,call_function,mul.Tensor,backward,13,2,2,1,1447,1448,8 -5334,alias_default_1035,call_function,alias.default,backward,13,1,1,2,4714,1441,4 -5335,alias_default_1036,call_function,alias.default,backward,13,1,1,3,1448,1447,4 -5336,mul_500,call_function,mul.Tensor,backward,13,2,2,1,4718,1440,8 -5337,sum_61,call_function,sum.dim_IntList,backward,13,1,1,1,4719,1439,5 -5338,div_58,call_function,div.Tensor,backward,13,1,1,1,1449,1439,6 -5339,mul_501,call_function,mul.Tensor,backward,13,2,2,1,4721,1438,8 -5340,sub_45,call_function,sub.Tensor,backward,13,2,2,1,4722,1437,10 -5341,mul_502,call_function,mul.Tensor,backward,13,2,2,1,4723,1436,8 -5342,mul_503,call_function,mul.Tensor,backward,13,2,2,1,4715,4,8 -5343,sum_62,call_function,sum.dim_IntList,backward,13,1,1,1,4716,3,5 -5344,convert_element_type_1358,call_function,convert_element_type.default,backward,13,1,1,1,4724,1435,6 -5345,convert_element_type_1359,call_function,convert_element_type.default,backward,13,1,1,1,4717,2,3 -5346,add_245,call_function,add.Tensor,unknown,,2,2,1,4725,1434,10 -5347,dtype_cast_391,call_function,dtype_cast.default,backward,13,1,1,1,4718,1,3 -5348,alias_default_1371,call_function,alias.default,backward,13,1,1,0,4719,0,2 -5349,alias_default_1037,call_function,alias.default,unknown,,1,1,3,4726,1433,4 -5350,einsum_default_409,call_function,einsum.default,backward,12,2,2,1,4727,3,5 -5351,permute_795,call_function,permute.default,backward,12,1,1,1,4,1429,3 -5352,einsum_default_410,call_function,einsum.default,backward,12,2,2,1,4728,1428,5 -5353,permute_796,call_function,permute.default,backward,12,1,1,1,4728,2,4 -5354,dtype_cast_392,call_function,dtype_cast.default,backward,12,1,1,1,4729,1,4 
-5355,alias_default_1360,call_function,alias.default,backward,12,1,1,0,4730,0,3 -5356,alias_default_1038,call_function,alias.default,backward,12,1,1,2,4729,1427,4 -5357,mul_504,call_function,mul.Tensor,backward,12,2,2,1,4730,1415,8 -5358,mul_505,call_function,mul.Tensor,backward,12,2,2,1,4730,1419,8 -5359,alias_default_1039,call_function,alias.default,backward,12,1,1,2,4731,1414,4 -5360,einsum_default_411,call_function,einsum.default,backward,12,2,2,1,4732,3,5 -5361,permute_799,call_function,permute.default,backward,12,1,1,1,4,1410,3 -5362,einsum_default_412,call_function,einsum.default,backward,12,2,2,1,4733,1409,5 -5363,permute_800,call_function,permute.default,backward,12,1,1,1,4733,2,4 -5364,dtype_cast_393,call_function,dtype_cast.default,backward,12,1,1,1,4734,1,4 -5365,alias_default_1361,call_function,alias.default,backward,12,1,1,0,4735,0,3 -5366,convert_element_type_1368,call_function,convert_element_type.default,backward,12,1,1,1,4731,1418,6 -5367,convert_element_type_1369,call_function,convert_element_type.default,backward,12,1,1,1,1416,1428,4 -5368,alias_default_1040,call_function,alias.default,backward,12,1,1,2,1417,1427,4 -5369,neg_43,call_function,neg.default,backward,12,1,1,1,1418,1426,8 -5370,exp_43,call_function,exp.default,backward,12,1,1,1,1419,1425,6 -5371,add_246,call_function,add.Tensor,backward,12,1,1,1,1420,1424,4 -5372,reciprocal_15,call_function,reciprocal.default,backward,12,1,1,1,1421,1423,4 -5373,mul_506,call_function,mul.Tensor,backward,12,1,1,1,1422,1422,6 -5374,alias_default_1041,call_function,alias.default,backward,12,1,1,2,1423,1421,4 -5375,mul_507,call_function,mul.Tensor,backward,12,2,2,1,4740,1417,8 -5376,sub_46,call_function,sub.Tensor,backward,12,1,1,1,1424,1419,4 -5377,mul_508,call_function,mul.Tensor,backward,12,2,2,1,1425,1418,8 -5378,add_247,call_function,add.Tensor,backward,12,1,1,1,1426,1417,4 -5379,mul_509,call_function,mul.Tensor,backward,12,2,2,1,4744,1416,8 
-5380,convert_element_type_1370,call_function,convert_element_type.default,backward,12,1,1,1,4745,1415,6 -5381,alias_default_1042,call_function,alias.default,backward,12,1,1,2,4746,1414,4 -5382,einsum_default_413,call_function,einsum.default,backward,12,2,2,1,4747,3,5 -5383,permute_803,call_function,permute.default,backward,12,1,1,1,4,1410,3 -5384,einsum_default_414,call_function,einsum.default,backward,12,2,2,1,4748,1409,5 -5385,add_248,call_function,add.Tensor,unknown,,2,2,1,4753,1408,10 -5386,permute_804,call_function,permute.default,backward,12,1,1,1,4748,2,4 -5387,dtype_cast_394,call_function,dtype_cast.default,backward,12,1,1,1,4749,1,4 -5388,alias_default_1359,call_function,alias.default,backward,12,1,1,0,4750,0,3 -5389,convert_element_type_1375,call_function,convert_element_type.default,backward,12,1,1,1,4754,1407,8 -5390,convert_element_type_1376,call_function,convert_element_type.default,backward,12,1,1,1,1396,1407,4 -5391,convert_element_type_1377,call_function,convert_element_type.default,backward,12,1,1,1,3,1401,2 -5392,alias_default_1043,call_function,alias.default,backward,12,1,1,2,4755,1406,4 -5393,mul_510,call_function,mul.Tensor,backward,12,2,2,1,4757,1400,8 -5394,mul_511,call_function,mul.Tensor,backward,12,2,2,1,1404,1406,8 -5395,alias_default_1044,call_function,alias.default,backward,12,1,1,2,4758,1399,4 -5396,alias_default_1045,call_function,alias.default,backward,12,1,1,3,1405,1405,4 -5397,mul_512,call_function,mul.Tensor,backward,12,2,2,1,4762,1398,8 -5398,sum_63,call_function,sum.dim_IntList,backward,12,1,1,1,4763,1397,5 -5399,div_59,call_function,div.Tensor,backward,12,1,1,1,1406,1397,6 -5400,mul_513,call_function,mul.Tensor,backward,12,2,2,1,4765,1396,8 -5401,sub_47,call_function,sub.Tensor,backward,12,2,2,1,4766,1395,10 -5402,mul_514,call_function,mul.Tensor,backward,12,2,2,1,4767,1394,8 -5403,mul_515,call_function,mul.Tensor,backward,12,2,2,1,4759,4,8 -5404,sum_64,call_function,sum.dim_IntList,backward,12,1,1,1,4760,3,5 
-5405,convert_element_type_1378,call_function,convert_element_type.default,backward,12,1,1,1,4768,1393,6 -5406,convert_element_type_1379,call_function,convert_element_type.default,backward,12,1,1,1,4761,2,3 -5407,add_249,call_function,add.Tensor,unknown,,2,2,1,4769,1392,10 -5408,dtype_cast_395,call_function,dtype_cast.default,backward,12,1,1,1,4762,1,3 -5409,alias_default_1363,call_function,alias.default,backward,12,1,1,0,4763,0,2 -5410,alias_default_1046,call_function,alias.default,unknown,,1,1,3,4770,1391,4 -5411,einsum_default_415,call_function,einsum.default,backward,12,2,2,1,4771,3,5 -5412,permute_807,call_function,permute.default,backward,12,1,1,1,4,1387,3 -5413,einsum_default_416,call_function,einsum.default,backward,12,2,2,1,4772,1386,5 -5414,permute_808,call_function,permute.default,backward,12,1,1,1,4772,2,4 -5415,dtype_cast_396,call_function,dtype_cast.default,backward,12,1,1,1,4773,1,4 -5416,alias_default_1358,call_function,alias.default,backward,12,1,1,0,4774,0,3 -5417,view_986,call_function,view.default,backward,12,1,1,1,4773,1385,4 -5418,permute_809,call_function,permute.default,backward,12,1,1,1,4774,1384,4 -5419,_scaled_dot_product_flash_attention_backward_15,call_function,_scaled_dot_product_flash_attention_backward.default,backward,12,8,8,3,4778,1383,2 -5420,getitem_297,call_function,getitem,backward,12,1,1,1,4779,1356,2 -5421,getitem_298,call_function,getitem,backward,12,1,1,1,4779,1357,2 -5422,getitem_299,call_function,getitem,backward,12,1,1,1,4779,1350,2 -5423,permute_810,call_function,permute.default,backward,12,1,1,1,4780,1349,2 -5424,permute_811,call_function,permute.default,backward,12,1,1,1,4780,1356,2 -5425,permute_812,call_function,permute.default,backward,12,1,1,1,4780,1355,2 -5426,convert_element_type_1384,call_function,convert_element_type.default,backward,12,1,1,1,4781,1355,2 -5427,convert_element_type_1385,call_function,convert_element_type.default,backward,12,1,1,1,4781,1354,2 
-5428,view_987,call_function,view.default,backward,12,1,1,1,4782,1354,2 -5429,view_as_complex_86,call_function,view_as_complex.default,backward,12,1,1,1,4783,1353,6 -5430,_conj_30,call_function,_conj.default,backward,12,1,1,1,4,1354,3 -5431,clone_126,call_function,clone.default,backward,12,1,1,1,5,1353,3 -5432,mul_516,call_function,mul.Tensor,backward,12,2,2,1,4786,1352,8 -5433,view_988,call_function,view.default,backward,12,1,1,1,4782,1353,2 -5434,view_as_complex_87,call_function,view_as_complex.default,backward,12,1,1,1,4783,1352,6 -5435,_conj_31,call_function,_conj.default,backward,12,1,1,1,4,1353,3 -5436,clone_127,call_function,clone.default,backward,12,1,1,1,5,1352,3 -5437,mul_517,call_function,mul.Tensor,backward,12,2,2,1,4786,1351,8 -5438,view_as_real_86,call_function,view_as_real.default,backward,12,1,1,1,4787,1351,6 -5439,view_989,call_function,view.default,backward,12,1,1,1,4788,1350,6 -5440,convert_element_type_1386,call_function,convert_element_type.default,backward,12,1,1,1,4789,1349,6 -5441,view_as_real_87,call_function,view_as_real.default,backward,12,1,1,1,4787,1350,6 -5442,view_990,call_function,view.default,backward,12,1,1,1,4788,1349,6 -5443,convert_element_type_1387,call_function,convert_element_type.default,backward,12,1,1,1,4789,1348,6 -5444,view_991,call_function,view.default,backward,12,1,1,1,4781,1348,2 -5445,view_992,call_function,view.default,backward,12,1,1,1,4790,1348,5 -5446,view_993,call_function,view.default,backward,12,1,1,1,4790,1347,5 -5447,alias_default_1047,call_function,alias.default,backward,12,1,1,2,4782,1347,4 -5448,einsum_default_417,call_function,einsum.default,backward,12,2,2,1,4783,3,5 -5449,permute_815,call_function,permute.default,backward,12,1,1,1,4,1343,3 -5450,einsum_default_418,call_function,einsum.default,backward,12,2,2,1,4784,1342,5 -5451,permute_816,call_function,permute.default,backward,12,1,1,1,4784,2,4 -5452,dtype_cast_397,call_function,dtype_cast.default,backward,12,1,1,1,4785,1,4 
-5453,alias_default_1357,call_function,alias.default,backward,12,1,1,0,4786,0,3 -5454,alias_default_1048,call_function,alias.default,backward,12,1,1,2,4791,1347,4 -5455,einsum_default_419,call_function,einsum.default,backward,12,2,2,1,4792,3,5 -5456,permute_819,call_function,permute.default,backward,12,1,1,1,4,1343,3 -5457,einsum_default_420,call_function,einsum.default,backward,12,2,2,1,4793,1342,5 -5458,add_250,call_function,add.Tensor,unknown,,2,2,1,4800,1341,10 -5459,permute_820,call_function,permute.default,backward,12,1,1,1,4793,2,4 -5460,dtype_cast_398,call_function,dtype_cast.default,backward,12,1,1,1,4794,1,4 -5461,alias_default_1356,call_function,alias.default,backward,12,1,1,0,4795,0,3 -5462,alias_default_1049,call_function,alias.default,backward,12,1,1,2,4791,1346,4 -5463,einsum_default_421,call_function,einsum.default,backward,12,2,2,1,4792,3,5 -5464,permute_823,call_function,permute.default,backward,12,1,1,1,4,1342,3 -5465,einsum_default_422,call_function,einsum.default,backward,12,2,2,1,4793,1341,5 -5466,add_251,call_function,add.Tensor,unknown,,2,2,1,4816,1340,10 -5467,permute_824,call_function,permute.default,backward,12,1,1,1,4793,2,4 -5468,dtype_cast_399,call_function,dtype_cast.default,backward,12,1,1,1,4794,1,4 -5469,alias_default_1355,call_function,alias.default,backward,12,1,1,0,4795,0,3 -5470,convert_element_type_1400,call_function,convert_element_type.default,backward,12,1,1,1,4817,1339,8 -5471,convert_element_type_1401,call_function,convert_element_type.default,backward,12,1,1,1,1329,1339,4 -5472,convert_element_type_1402,call_function,convert_element_type.default,backward,12,1,1,1,3,1333,2 -5473,alias_default_1050,call_function,alias.default,backward,12,1,1,2,4818,1338,4 -5474,mul_518,call_function,mul.Tensor,backward,12,2,2,1,4820,1332,8 -5475,mul_519,call_function,mul.Tensor,backward,12,2,2,1,1337,1338,8 -5476,alias_default_1051,call_function,alias.default,backward,12,1,1,2,4821,1331,4 
-5477,alias_default_1052,call_function,alias.default,backward,12,1,1,3,1338,1337,4 -5478,mul_520,call_function,mul.Tensor,backward,12,2,2,1,4825,1330,8 -5479,sum_65,call_function,sum.dim_IntList,backward,12,1,1,1,4826,1329,5 -5480,div_60,call_function,div.Tensor,backward,12,1,1,1,1339,1329,6 -5481,mul_521,call_function,mul.Tensor,backward,12,2,2,1,4828,1328,8 -5482,sub_48,call_function,sub.Tensor,backward,12,2,2,1,4829,1327,10 -5483,mul_522,call_function,mul.Tensor,backward,12,2,2,1,4830,1326,8 -5484,mul_523,call_function,mul.Tensor,backward,12,2,2,1,4822,4,8 -5485,sum_66,call_function,sum.dim_IntList,backward,12,1,1,1,4823,3,5 -5486,convert_element_type_1403,call_function,convert_element_type.default,backward,12,1,1,1,4831,1325,6 -5487,convert_element_type_1404,call_function,convert_element_type.default,backward,12,1,1,1,4824,2,3 -5488,add_252,call_function,add.Tensor,unknown,,2,2,1,4832,1324,10 -5489,dtype_cast_400,call_function,dtype_cast.default,backward,12,1,1,1,4825,1,3 -5490,alias_default_1362,call_function,alias.default,backward,12,1,1,0,4826,0,2 -5491,alias_default_1053,call_function,alias.default,unknown,,1,1,3,4833,1323,4 -5492,einsum_default_423,call_function,einsum.default,backward,11,2,2,1,4834,3,5 -5493,permute_827,call_function,permute.default,backward,11,1,1,1,4,1319,3 -5494,einsum_default_424,call_function,einsum.default,backward,11,2,2,1,4835,1318,5 -5495,permute_828,call_function,permute.default,backward,11,1,1,1,4835,2,4 -5496,dtype_cast_401,call_function,dtype_cast.default,backward,11,1,1,1,4836,1,4 -5497,alias_default_1351,call_function,alias.default,backward,11,1,1,0,4837,0,3 -5498,alias_default_1054,call_function,alias.default,backward,11,1,1,2,4836,1317,4 -5499,mul_524,call_function,mul.Tensor,backward,11,2,2,1,4837,1305,8 -5500,mul_525,call_function,mul.Tensor,backward,11,2,2,1,4837,1309,8 -5501,alias_default_1055,call_function,alias.default,backward,11,1,1,2,4838,1304,4 
-5502,einsum_default_425,call_function,einsum.default,backward,11,2,2,1,4839,3,5 -5503,permute_831,call_function,permute.default,backward,11,1,1,1,4,1300,3 -5504,einsum_default_426,call_function,einsum.default,backward,11,2,2,1,4840,1299,5 -5505,permute_832,call_function,permute.default,backward,11,1,1,1,4840,2,4 -5506,dtype_cast_402,call_function,dtype_cast.default,backward,11,1,1,1,4841,1,4 -5507,alias_default_1352,call_function,alias.default,backward,11,1,1,0,4842,0,3 -5508,convert_element_type_1413,call_function,convert_element_type.default,backward,11,1,1,1,4838,1308,6 -5509,convert_element_type_1414,call_function,convert_element_type.default,backward,11,1,1,1,1306,1318,4 -5510,alias_default_1056,call_function,alias.default,backward,11,1,1,2,1307,1317,4 -5511,neg_44,call_function,neg.default,backward,11,1,1,1,1308,1316,8 -5512,exp_44,call_function,exp.default,backward,11,1,1,1,1309,1315,6 -5513,add_253,call_function,add.Tensor,backward,11,1,1,1,1310,1314,4 -5514,reciprocal_16,call_function,reciprocal.default,backward,11,1,1,1,1311,1313,4 -5515,mul_526,call_function,mul.Tensor,backward,11,1,1,1,1312,1312,6 -5516,alias_default_1057,call_function,alias.default,backward,11,1,1,2,1313,1311,4 -5517,mul_527,call_function,mul.Tensor,backward,11,2,2,1,4847,1307,8 -5518,sub_49,call_function,sub.Tensor,backward,11,1,1,1,1314,1309,4 -5519,mul_528,call_function,mul.Tensor,backward,11,2,2,1,1315,1308,8 -5520,add_254,call_function,add.Tensor,backward,11,1,1,1,1316,1307,4 -5521,mul_529,call_function,mul.Tensor,backward,11,2,2,1,4851,1306,8 -5522,convert_element_type_1415,call_function,convert_element_type.default,backward,11,1,1,1,4852,1305,6 -5523,alias_default_1058,call_function,alias.default,backward,11,1,1,2,4853,1304,4 -5524,einsum_default_427,call_function,einsum.default,backward,11,2,2,1,4854,3,5 -5525,permute_835,call_function,permute.default,backward,11,1,1,1,4,1300,3 -5526,einsum_default_428,call_function,einsum.default,backward,11,2,2,1,4855,1299,5 
-5527,add_255,call_function,add.Tensor,unknown,,2,2,1,4860,1298,10 -5528,permute_836,call_function,permute.default,backward,11,1,1,1,4855,2,4 -5529,dtype_cast_403,call_function,dtype_cast.default,backward,11,1,1,1,4856,1,4 -5530,alias_default_1350,call_function,alias.default,backward,11,1,1,0,4857,0,3 -5531,convert_element_type_1420,call_function,convert_element_type.default,backward,11,1,1,1,4861,1297,8 -5532,convert_element_type_1421,call_function,convert_element_type.default,backward,11,1,1,1,1286,1297,4 -5533,convert_element_type_1422,call_function,convert_element_type.default,backward,11,1,1,1,3,1291,2 -5534,alias_default_1059,call_function,alias.default,backward,11,1,1,2,4862,1296,4 -5535,mul_530,call_function,mul.Tensor,backward,11,2,2,1,4864,1290,8 -5536,mul_531,call_function,mul.Tensor,backward,11,2,2,1,1294,1296,8 -5537,alias_default_1060,call_function,alias.default,backward,11,1,1,2,4865,1289,4 -5538,alias_default_1061,call_function,alias.default,backward,11,1,1,3,1295,1295,4 -5539,mul_532,call_function,mul.Tensor,backward,11,2,2,1,4869,1288,8 -5540,sum_67,call_function,sum.dim_IntList,backward,11,1,1,1,4870,1287,5 -5541,div_61,call_function,div.Tensor,backward,11,1,1,1,1296,1287,6 -5542,mul_533,call_function,mul.Tensor,backward,11,2,2,1,4872,1286,8 -5543,sub_50,call_function,sub.Tensor,backward,11,2,2,1,4873,1285,10 -5544,mul_534,call_function,mul.Tensor,backward,11,2,2,1,4874,1284,8 -5545,mul_535,call_function,mul.Tensor,backward,11,2,2,1,4866,4,8 -5546,sum_68,call_function,sum.dim_IntList,backward,11,1,1,1,4867,3,5 -5547,convert_element_type_1423,call_function,convert_element_type.default,backward,11,1,1,1,4875,1283,6 -5548,convert_element_type_1424,call_function,convert_element_type.default,backward,11,1,1,1,4868,2,3 -5549,add_256,call_function,add.Tensor,unknown,,2,2,1,4876,1282,10 -5550,dtype_cast_404,call_function,dtype_cast.default,backward,11,1,1,1,4869,1,3 -5551,alias_default_1354,call_function,alias.default,backward,11,1,1,0,4870,0,2 
-5552,alias_default_1062,call_function,alias.default,unknown,,1,1,3,4877,1281,4 -5553,einsum_default_429,call_function,einsum.default,backward,11,2,2,1,4878,3,5 -5554,permute_839,call_function,permute.default,backward,11,1,1,1,4,1277,3 -5555,einsum_default_430,call_function,einsum.default,backward,11,2,2,1,4879,1276,5 -5556,permute_840,call_function,permute.default,backward,11,1,1,1,4879,2,4 -5557,dtype_cast_405,call_function,dtype_cast.default,backward,11,1,1,1,4880,1,4 -5558,alias_default_1349,call_function,alias.default,backward,11,1,1,0,4881,0,3 -5559,view_1008,call_function,view.default,backward,11,1,1,1,4880,1275,4 -5560,permute_841,call_function,permute.default,backward,11,1,1,1,4881,1274,4 -5561,_scaled_dot_product_flash_attention_backward_16,call_function,_scaled_dot_product_flash_attention_backward.default,backward,11,8,8,3,4885,1273,2 -5562,getitem_300,call_function,getitem,backward,11,1,1,1,4886,1246,2 -5563,getitem_301,call_function,getitem,backward,11,1,1,1,4886,1247,2 -5564,getitem_302,call_function,getitem,backward,11,1,1,1,4886,1240,2 -5565,permute_842,call_function,permute.default,backward,11,1,1,1,4887,1239,2 -5566,permute_843,call_function,permute.default,backward,11,1,1,1,4887,1246,2 -5567,permute_844,call_function,permute.default,backward,11,1,1,1,4887,1245,2 -5568,convert_element_type_1429,call_function,convert_element_type.default,backward,11,1,1,1,4888,1245,2 -5569,convert_element_type_1430,call_function,convert_element_type.default,backward,11,1,1,1,4888,1244,2 -5570,view_1009,call_function,view.default,backward,11,1,1,1,4889,1244,2 -5571,view_as_complex_88,call_function,view_as_complex.default,backward,11,1,1,1,4890,1243,6 -5572,_conj_32,call_function,_conj.default,backward,11,1,1,1,4,1244,3 -5573,clone_134,call_function,clone.default,backward,11,1,1,1,5,1243,3 -5574,mul_536,call_function,mul.Tensor,backward,11,2,2,1,4893,1242,8 -5575,view_1010,call_function,view.default,backward,11,1,1,1,4889,1243,2 
-5576,view_as_complex_89,call_function,view_as_complex.default,backward,11,1,1,1,4890,1242,6 -5577,_conj_33,call_function,_conj.default,backward,11,1,1,1,4,1243,3 -5578,clone_135,call_function,clone.default,backward,11,1,1,1,5,1242,3 -5579,mul_537,call_function,mul.Tensor,backward,11,2,2,1,4893,1241,8 -5580,view_as_real_88,call_function,view_as_real.default,backward,11,1,1,1,4894,1241,6 -5581,view_1011,call_function,view.default,backward,11,1,1,1,4895,1240,6 -5582,convert_element_type_1431,call_function,convert_element_type.default,backward,11,1,1,1,4896,1239,6 -5583,view_as_real_89,call_function,view_as_real.default,backward,11,1,1,1,4894,1240,6 -5584,view_1012,call_function,view.default,backward,11,1,1,1,4895,1239,6 -5585,convert_element_type_1432,call_function,convert_element_type.default,backward,11,1,1,1,4896,1238,6 -5586,view_1013,call_function,view.default,backward,11,1,1,1,4888,1238,2 -5587,view_1014,call_function,view.default,backward,11,1,1,1,4897,1238,5 -5588,view_1015,call_function,view.default,backward,11,1,1,1,4897,1237,5 -5589,alias_default_1063,call_function,alias.default,backward,11,1,1,2,4889,1237,4 -5590,einsum_default_431,call_function,einsum.default,backward,11,2,2,1,4890,3,5 -5591,permute_847,call_function,permute.default,backward,11,1,1,1,4,1233,3 -5592,einsum_default_432,call_function,einsum.default,backward,11,2,2,1,4891,1232,5 -5593,permute_848,call_function,permute.default,backward,11,1,1,1,4891,2,4 -5594,dtype_cast_406,call_function,dtype_cast.default,backward,11,1,1,1,4892,1,4 -5595,alias_default_1348,call_function,alias.default,backward,11,1,1,0,4893,0,3 -5596,alias_default_1064,call_function,alias.default,backward,11,1,1,2,4898,1237,4 -5597,einsum_default_433,call_function,einsum.default,backward,11,2,2,1,4899,3,5 -5598,permute_851,call_function,permute.default,backward,11,1,1,1,4,1233,3 -5599,einsum_default_434,call_function,einsum.default,backward,11,2,2,1,4900,1232,5 
-5600,add_257,call_function,add.Tensor,unknown,,2,2,1,4907,1231,10 -5601,permute_852,call_function,permute.default,backward,11,1,1,1,4900,2,4 -5602,dtype_cast_407,call_function,dtype_cast.default,backward,11,1,1,1,4901,1,4 -5603,alias_default_1347,call_function,alias.default,backward,11,1,1,0,4902,0,3 -5604,alias_default_1065,call_function,alias.default,backward,11,1,1,2,4898,1236,4 -5605,einsum_default_435,call_function,einsum.default,backward,11,2,2,1,4899,3,5 -5606,permute_855,call_function,permute.default,backward,11,1,1,1,4,1232,3 -5607,einsum_default_436,call_function,einsum.default,backward,11,2,2,1,4900,1231,5 -5608,add_258,call_function,add.Tensor,unknown,,2,2,1,4923,1230,10 -5609,permute_856,call_function,permute.default,backward,11,1,1,1,4900,2,4 -5610,dtype_cast_408,call_function,dtype_cast.default,backward,11,1,1,1,4901,1,4 -5611,alias_default_1346,call_function,alias.default,backward,11,1,1,0,4902,0,3 -5612,convert_element_type_1445,call_function,convert_element_type.default,backward,11,1,1,1,4924,1229,8 -5613,convert_element_type_1446,call_function,convert_element_type.default,backward,11,1,1,1,1219,1229,4 -5614,convert_element_type_1447,call_function,convert_element_type.default,backward,11,1,1,1,3,1223,2 -5615,alias_default_1066,call_function,alias.default,backward,11,1,1,2,4925,1228,4 -5616,mul_538,call_function,mul.Tensor,backward,11,2,2,1,4927,1222,8 -5617,mul_539,call_function,mul.Tensor,backward,11,2,2,1,1227,1228,8 -5618,alias_default_1067,call_function,alias.default,backward,11,1,1,2,4928,1221,4 -5619,alias_default_1068,call_function,alias.default,backward,11,1,1,3,1228,1227,4 -5620,mul_540,call_function,mul.Tensor,backward,11,2,2,1,4932,1220,8 -5621,sum_69,call_function,sum.dim_IntList,backward,11,1,1,1,4933,1219,5 -5622,div_62,call_function,div.Tensor,backward,11,1,1,1,1229,1219,6 -5623,mul_541,call_function,mul.Tensor,backward,11,2,2,1,4935,1218,8 -5624,sub_51,call_function,sub.Tensor,backward,11,2,2,1,4936,1217,10 
-5625,mul_542,call_function,mul.Tensor,backward,11,2,2,1,4937,1216,8 -5626,mul_543,call_function,mul.Tensor,backward,11,2,2,1,4929,4,8 -5627,sum_70,call_function,sum.dim_IntList,backward,11,1,1,1,4930,3,5 -5628,convert_element_type_1448,call_function,convert_element_type.default,backward,11,1,1,1,4938,1215,6 -5629,convert_element_type_1449,call_function,convert_element_type.default,backward,11,1,1,1,4931,2,3 -5630,add_259,call_function,add.Tensor,unknown,,2,2,1,4939,1214,10 -5631,dtype_cast_409,call_function,dtype_cast.default,backward,11,1,1,1,4932,1,3 -5632,alias_default_1353,call_function,alias.default,backward,11,1,1,0,4933,0,2 -5633,alias_default_1069,call_function,alias.default,unknown,,1,1,3,4940,1213,4 -5634,einsum_default_437,call_function,einsum.default,backward,10,2,2,1,4941,3,5 -5635,permute_859,call_function,permute.default,backward,10,1,1,1,4,1209,3 -5636,einsum_default_438,call_function,einsum.default,backward,10,2,2,1,4942,1208,5 -5637,permute_860,call_function,permute.default,backward,10,1,1,1,4942,2,4 -5638,dtype_cast_410,call_function,dtype_cast.default,backward,10,1,1,1,4943,1,4 -5639,alias_default_1342,call_function,alias.default,backward,10,1,1,0,4944,0,3 -5640,alias_default_1070,call_function,alias.default,backward,10,1,1,2,4943,1207,4 -5641,mul_544,call_function,mul.Tensor,backward,10,2,2,1,4944,1195,8 -5642,mul_545,call_function,mul.Tensor,backward,10,2,2,1,4944,1199,8 -5643,alias_default_1071,call_function,alias.default,backward,10,1,1,2,4945,1194,4 -5644,einsum_default_439,call_function,einsum.default,backward,10,2,2,1,4946,3,5 -5645,permute_863,call_function,permute.default,backward,10,1,1,1,4,1190,3 -5646,einsum_default_440,call_function,einsum.default,backward,10,2,2,1,4947,1189,5 -5647,permute_864,call_function,permute.default,backward,10,1,1,1,4947,2,4 -5648,dtype_cast_411,call_function,dtype_cast.default,backward,10,1,1,1,4948,1,4 -5649,alias_default_1343,call_function,alias.default,backward,10,1,1,0,4949,0,3 
-5650,convert_element_type_1458,call_function,convert_element_type.default,backward,10,1,1,1,4945,1198,6 -5651,convert_element_type_1459,call_function,convert_element_type.default,backward,10,1,1,1,1196,1208,4 -5652,alias_default_1072,call_function,alias.default,backward,10,1,1,2,1197,1207,4 -5653,neg_45,call_function,neg.default,backward,10,1,1,1,1198,1206,8 -5654,exp_45,call_function,exp.default,backward,10,1,1,1,1199,1205,6 -5655,add_260,call_function,add.Tensor,backward,10,1,1,1,1200,1204,4 -5656,reciprocal_17,call_function,reciprocal.default,backward,10,1,1,1,1201,1203,4 -5657,mul_546,call_function,mul.Tensor,backward,10,1,1,1,1202,1202,6 -5658,alias_default_1073,call_function,alias.default,backward,10,1,1,2,1203,1201,4 -5659,mul_547,call_function,mul.Tensor,backward,10,2,2,1,4954,1197,8 -5660,sub_52,call_function,sub.Tensor,backward,10,1,1,1,1204,1199,4 -5661,mul_548,call_function,mul.Tensor,backward,10,2,2,1,1205,1198,8 -5662,add_261,call_function,add.Tensor,backward,10,1,1,1,1206,1197,4 -5663,mul_549,call_function,mul.Tensor,backward,10,2,2,1,4958,1196,8 -5664,convert_element_type_1460,call_function,convert_element_type.default,backward,10,1,1,1,4959,1195,6 -5665,alias_default_1074,call_function,alias.default,backward,10,1,1,2,4960,1194,4 -5666,einsum_default_441,call_function,einsum.default,backward,10,2,2,1,4961,3,5 -5667,permute_867,call_function,permute.default,backward,10,1,1,1,4,1190,3 -5668,einsum_default_442,call_function,einsum.default,backward,10,2,2,1,4962,1189,5 -5669,add_262,call_function,add.Tensor,unknown,,2,2,1,4967,1188,10 -5670,permute_868,call_function,permute.default,backward,10,1,1,1,4962,2,4 -5671,dtype_cast_412,call_function,dtype_cast.default,backward,10,1,1,1,4963,1,4 -5672,alias_default_1341,call_function,alias.default,backward,10,1,1,0,4964,0,3 -5673,convert_element_type_1465,call_function,convert_element_type.default,backward,10,1,1,1,4968,1187,8 
-5674,convert_element_type_1466,call_function,convert_element_type.default,backward,10,1,1,1,1176,1187,4 -5675,convert_element_type_1467,call_function,convert_element_type.default,backward,10,1,1,1,3,1181,2 -5676,alias_default_1075,call_function,alias.default,backward,10,1,1,2,4969,1186,4 -5677,mul_550,call_function,mul.Tensor,backward,10,2,2,1,4971,1180,8 -5678,mul_551,call_function,mul.Tensor,backward,10,2,2,1,1184,1186,8 -5679,alias_default_1076,call_function,alias.default,backward,10,1,1,2,4972,1179,4 -5680,alias_default_1077,call_function,alias.default,backward,10,1,1,3,1185,1185,4 -5681,mul_552,call_function,mul.Tensor,backward,10,2,2,1,4976,1178,8 -5682,sum_71,call_function,sum.dim_IntList,backward,10,1,1,1,4977,1177,5 -5683,div_63,call_function,div.Tensor,backward,10,1,1,1,1186,1177,6 -5684,mul_553,call_function,mul.Tensor,backward,10,2,2,1,4979,1176,8 -5685,sub_53,call_function,sub.Tensor,backward,10,2,2,1,4980,1175,10 -5686,mul_554,call_function,mul.Tensor,backward,10,2,2,1,4981,1174,8 -5687,mul_555,call_function,mul.Tensor,backward,10,2,2,1,4973,4,8 -5688,sum_72,call_function,sum.dim_IntList,backward,10,1,1,1,4974,3,5 -5689,convert_element_type_1468,call_function,convert_element_type.default,backward,10,1,1,1,4982,1173,6 -5690,convert_element_type_1469,call_function,convert_element_type.default,backward,10,1,1,1,4975,2,3 -5691,add_263,call_function,add.Tensor,unknown,,2,2,1,4983,1172,10 -5692,dtype_cast_413,call_function,dtype_cast.default,backward,10,1,1,1,4976,1,3 -5693,alias_default_1345,call_function,alias.default,backward,10,1,1,0,4977,0,2 -5694,alias_default_1078,call_function,alias.default,unknown,,1,1,3,4984,1171,4 -5695,einsum_default_443,call_function,einsum.default,backward,10,2,2,1,4985,3,5 -5696,permute_871,call_function,permute.default,backward,10,1,1,1,4,1167,3 -5697,einsum_default_444,call_function,einsum.default,backward,10,2,2,1,4986,1166,5 -5698,permute_872,call_function,permute.default,backward,10,1,1,1,4986,2,4 
-5699,dtype_cast_414,call_function,dtype_cast.default,backward,10,1,1,1,4987,1,4 -5700,alias_default_1340,call_function,alias.default,backward,10,1,1,0,4988,0,3 -5701,view_1030,call_function,view.default,backward,10,1,1,1,4987,1165,4 -5702,permute_873,call_function,permute.default,backward,10,1,1,1,4988,1164,4 -5703,_scaled_dot_product_flash_attention_backward_17,call_function,_scaled_dot_product_flash_attention_backward.default,backward,10,8,8,3,4992,1163,2 -5704,getitem_303,call_function,getitem,backward,10,1,1,1,4993,1136,2 -5705,getitem_304,call_function,getitem,backward,10,1,1,1,4993,1137,2 -5706,getitem_305,call_function,getitem,backward,10,1,1,1,4993,1130,2 -5707,permute_874,call_function,permute.default,backward,10,1,1,1,4994,1129,2 -5708,permute_875,call_function,permute.default,backward,10,1,1,1,4994,1136,2 -5709,permute_876,call_function,permute.default,backward,10,1,1,1,4994,1135,2 -5710,convert_element_type_1474,call_function,convert_element_type.default,backward,10,1,1,1,4995,1135,2 -5711,convert_element_type_1475,call_function,convert_element_type.default,backward,10,1,1,1,4995,1134,2 -5712,view_1031,call_function,view.default,backward,10,1,1,1,4996,1134,2 -5713,view_as_complex_90,call_function,view_as_complex.default,backward,10,1,1,1,4997,1133,6 -5714,_conj_34,call_function,_conj.default,backward,10,1,1,1,4,1134,3 -5715,clone_142,call_function,clone.default,backward,10,1,1,1,5,1133,3 -5716,mul_556,call_function,mul.Tensor,backward,10,2,2,1,5000,1132,8 -5717,view_1032,call_function,view.default,backward,10,1,1,1,4996,1133,2 -5718,view_as_complex_91,call_function,view_as_complex.default,backward,10,1,1,1,4997,1132,6 -5719,_conj_35,call_function,_conj.default,backward,10,1,1,1,4,1133,3 -5720,clone_143,call_function,clone.default,backward,10,1,1,1,5,1132,3 -5721,mul_557,call_function,mul.Tensor,backward,10,2,2,1,5000,1131,8 -5722,view_as_real_90,call_function,view_as_real.default,backward,10,1,1,1,5001,1131,6 
-5723,view_1033,call_function,view.default,backward,10,1,1,1,5002,1130,6 -5724,convert_element_type_1476,call_function,convert_element_type.default,backward,10,1,1,1,5003,1129,6 -5725,view_as_real_91,call_function,view_as_real.default,backward,10,1,1,1,5001,1130,6 -5726,view_1034,call_function,view.default,backward,10,1,1,1,5002,1129,6 -5727,convert_element_type_1477,call_function,convert_element_type.default,backward,10,1,1,1,5003,1128,6 -5728,view_1035,call_function,view.default,backward,10,1,1,1,4995,1128,2 -5729,view_1036,call_function,view.default,backward,10,1,1,1,5004,1128,5 -5730,view_1037,call_function,view.default,backward,10,1,1,1,5004,1127,5 -5731,alias_default_1079,call_function,alias.default,backward,10,1,1,2,4996,1127,4 -5732,einsum_default_445,call_function,einsum.default,backward,10,2,2,1,4997,3,5 -5733,permute_879,call_function,permute.default,backward,10,1,1,1,4,1123,3 -5734,einsum_default_446,call_function,einsum.default,backward,10,2,2,1,4998,1122,5 -5735,permute_880,call_function,permute.default,backward,10,1,1,1,4998,2,4 -5736,dtype_cast_415,call_function,dtype_cast.default,backward,10,1,1,1,4999,1,4 -5737,alias_default_1339,call_function,alias.default,backward,10,1,1,0,5000,0,3 -5738,alias_default_1080,call_function,alias.default,backward,10,1,1,2,5005,1127,4 -5739,einsum_default_447,call_function,einsum.default,backward,10,2,2,1,5006,3,5 -5740,permute_883,call_function,permute.default,backward,10,1,1,1,4,1123,3 -5741,einsum_default_448,call_function,einsum.default,backward,10,2,2,1,5007,1122,5 -5742,add_264,call_function,add.Tensor,unknown,,2,2,1,5014,1121,10 -5743,permute_884,call_function,permute.default,backward,10,1,1,1,5007,2,4 -5744,dtype_cast_416,call_function,dtype_cast.default,backward,10,1,1,1,5008,1,4 -5745,alias_default_1338,call_function,alias.default,backward,10,1,1,0,5009,0,3 -5746,alias_default_1081,call_function,alias.default,backward,10,1,1,2,5005,1126,4 
-5747,einsum_default_449,call_function,einsum.default,backward,10,2,2,1,5006,3,5 -5748,permute_887,call_function,permute.default,backward,10,1,1,1,4,1122,3 -5749,einsum_default_450,call_function,einsum.default,backward,10,2,2,1,5007,1121,5 -5750,add_265,call_function,add.Tensor,unknown,,2,2,1,5030,1120,10 -5751,permute_888,call_function,permute.default,backward,10,1,1,1,5007,2,4 -5752,dtype_cast_417,call_function,dtype_cast.default,backward,10,1,1,1,5008,1,4 -5753,alias_default_1337,call_function,alias.default,backward,10,1,1,0,5009,0,3 -5754,convert_element_type_1490,call_function,convert_element_type.default,backward,10,1,1,1,5031,1119,8 -5755,convert_element_type_1491,call_function,convert_element_type.default,backward,10,1,1,1,1109,1119,4 -5756,convert_element_type_1492,call_function,convert_element_type.default,backward,10,1,1,1,3,1113,2 -5757,alias_default_1082,call_function,alias.default,backward,10,1,1,2,5032,1118,4 -5758,mul_558,call_function,mul.Tensor,backward,10,2,2,1,5034,1112,8 -5759,mul_559,call_function,mul.Tensor,backward,10,2,2,1,1117,1118,8 -5760,alias_default_1083,call_function,alias.default,backward,10,1,1,2,5035,1111,4 -5761,alias_default_1084,call_function,alias.default,backward,10,1,1,3,1118,1117,4 -5762,mul_560,call_function,mul.Tensor,backward,10,2,2,1,5039,1110,8 -5763,sum_73,call_function,sum.dim_IntList,backward,10,1,1,1,5040,1109,5 -5764,div_64,call_function,div.Tensor,backward,10,1,1,1,1119,1109,6 -5765,mul_561,call_function,mul.Tensor,backward,10,2,2,1,5042,1108,8 -5766,sub_54,call_function,sub.Tensor,backward,10,2,2,1,5043,1107,10 -5767,mul_562,call_function,mul.Tensor,backward,10,2,2,1,5044,1106,8 -5768,mul_563,call_function,mul.Tensor,backward,10,2,2,1,5036,4,8 -5769,sum_74,call_function,sum.dim_IntList,backward,10,1,1,1,5037,3,5 -5770,convert_element_type_1493,call_function,convert_element_type.default,backward,10,1,1,1,5045,1105,6 
-5771,convert_element_type_1494,call_function,convert_element_type.default,backward,10,1,1,1,5038,2,3 -5772,add_266,call_function,add.Tensor,unknown,,2,2,1,5046,1104,10 -5773,dtype_cast_418,call_function,dtype_cast.default,backward,10,1,1,1,5039,1,3 -5774,alias_default_1344,call_function,alias.default,backward,10,1,1,0,5040,0,2 -5775,alias_default_1085,call_function,alias.default,unknown,,1,1,3,5047,1103,4 -5776,einsum_default_451,call_function,einsum.default,backward,9,2,2,1,5048,3,5 -5777,permute_891,call_function,permute.default,backward,9,1,1,1,4,1099,3 -5778,einsum_default_452,call_function,einsum.default,backward,9,2,2,1,5049,1098,5 -5779,permute_892,call_function,permute.default,backward,9,1,1,1,5049,2,4 -5780,dtype_cast_419,call_function,dtype_cast.default,backward,9,1,1,1,5050,1,4 -5781,alias_default_1333,call_function,alias.default,backward,9,1,1,0,5051,0,3 -5782,alias_default_1086,call_function,alias.default,backward,9,1,1,2,5050,1097,4 -5783,mul_564,call_function,mul.Tensor,backward,9,2,2,1,5051,1085,8 -5784,mul_565,call_function,mul.Tensor,backward,9,2,2,1,5051,1089,8 -5785,alias_default_1087,call_function,alias.default,backward,9,1,1,2,5052,1084,4 -5786,einsum_default_453,call_function,einsum.default,backward,9,2,2,1,5053,3,5 -5787,permute_895,call_function,permute.default,backward,9,1,1,1,4,1080,3 -5788,einsum_default_454,call_function,einsum.default,backward,9,2,2,1,5054,1079,5 -5789,permute_896,call_function,permute.default,backward,9,1,1,1,5054,2,4 -5790,dtype_cast_420,call_function,dtype_cast.default,backward,9,1,1,1,5055,1,4 -5791,alias_default_1334,call_function,alias.default,backward,9,1,1,0,5056,0,3 -5792,convert_element_type_1503,call_function,convert_element_type.default,backward,9,1,1,1,5052,1088,6 -5793,convert_element_type_1504,call_function,convert_element_type.default,backward,9,1,1,1,1086,1098,4 -5794,alias_default_1088,call_function,alias.default,backward,9,1,1,2,1087,1097,4 
-5795,neg_46,call_function,neg.default,backward,9,1,1,1,1088,1096,8 -5796,exp_46,call_function,exp.default,backward,9,1,1,1,1089,1095,6 -5797,add_267,call_function,add.Tensor,backward,9,1,1,1,1090,1094,4 -5798,reciprocal_18,call_function,reciprocal.default,backward,9,1,1,1,1091,1093,4 -5799,mul_566,call_function,mul.Tensor,backward,9,1,1,1,1092,1092,6 -5800,alias_default_1089,call_function,alias.default,backward,9,1,1,2,1093,1091,4 -5801,mul_567,call_function,mul.Tensor,backward,9,2,2,1,5061,1087,8 -5802,sub_55,call_function,sub.Tensor,backward,9,1,1,1,1094,1089,4 -5803,mul_568,call_function,mul.Tensor,backward,9,2,2,1,1095,1088,8 -5804,add_268,call_function,add.Tensor,backward,9,1,1,1,1096,1087,4 -5805,mul_569,call_function,mul.Tensor,backward,9,2,2,1,5065,1086,8 -5806,convert_element_type_1505,call_function,convert_element_type.default,backward,9,1,1,1,5066,1085,6 -5807,alias_default_1090,call_function,alias.default,backward,9,1,1,2,5067,1084,4 -5808,einsum_default_455,call_function,einsum.default,backward,9,2,2,1,5068,3,5 -5809,permute_899,call_function,permute.default,backward,9,1,1,1,4,1080,3 -5810,einsum_default_456,call_function,einsum.default,backward,9,2,2,1,5069,1079,5 -5811,add_269,call_function,add.Tensor,unknown,,2,2,1,5074,1078,10 -5812,permute_900,call_function,permute.default,backward,9,1,1,1,5069,2,4 -5813,dtype_cast_421,call_function,dtype_cast.default,backward,9,1,1,1,5070,1,4 -5814,alias_default_1332,call_function,alias.default,backward,9,1,1,0,5071,0,3 -5815,convert_element_type_1510,call_function,convert_element_type.default,backward,9,1,1,1,5075,1077,8 -5816,convert_element_type_1511,call_function,convert_element_type.default,backward,9,1,1,1,1066,1077,4 -5817,convert_element_type_1512,call_function,convert_element_type.default,backward,9,1,1,1,3,1071,2 -5818,alias_default_1091,call_function,alias.default,backward,9,1,1,2,5076,1076,4 -5819,mul_570,call_function,mul.Tensor,backward,9,2,2,1,5078,1070,8 
-5820,mul_571,call_function,mul.Tensor,backward,9,2,2,1,1074,1076,8 -5821,alias_default_1092,call_function,alias.default,backward,9,1,1,2,5079,1069,4 -5822,alias_default_1093,call_function,alias.default,backward,9,1,1,3,1075,1075,4 -5823,mul_572,call_function,mul.Tensor,backward,9,2,2,1,5083,1068,8 -5824,sum_75,call_function,sum.dim_IntList,backward,9,1,1,1,5084,1067,5 -5825,div_65,call_function,div.Tensor,backward,9,1,1,1,1076,1067,6 -5826,mul_573,call_function,mul.Tensor,backward,9,2,2,1,5086,1066,8 -5827,sub_56,call_function,sub.Tensor,backward,9,2,2,1,5087,1065,10 -5828,mul_574,call_function,mul.Tensor,backward,9,2,2,1,5088,1064,8 -5829,mul_575,call_function,mul.Tensor,backward,9,2,2,1,5080,4,8 -5830,sum_76,call_function,sum.dim_IntList,backward,9,1,1,1,5081,3,5 -5831,convert_element_type_1513,call_function,convert_element_type.default,backward,9,1,1,1,5089,1063,6 -5832,convert_element_type_1514,call_function,convert_element_type.default,backward,9,1,1,1,5082,2,3 -5833,add_270,call_function,add.Tensor,unknown,,2,2,1,5090,1062,10 -5834,dtype_cast_422,call_function,dtype_cast.default,backward,9,1,1,1,5083,1,3 -5835,alias_default_1336,call_function,alias.default,backward,9,1,1,0,5084,0,2 -5836,alias_default_1094,call_function,alias.default,unknown,,1,1,3,5091,1061,4 -5837,einsum_default_457,call_function,einsum.default,backward,9,2,2,1,5092,3,5 -5838,permute_903,call_function,permute.default,backward,9,1,1,1,4,1057,3 -5839,einsum_default_458,call_function,einsum.default,backward,9,2,2,1,5093,1056,5 -5840,permute_904,call_function,permute.default,backward,9,1,1,1,5093,2,4 -5841,dtype_cast_423,call_function,dtype_cast.default,backward,9,1,1,1,5094,1,4 -5842,alias_default_1331,call_function,alias.default,backward,9,1,1,0,5095,0,3 -5843,view_1052,call_function,view.default,backward,9,1,1,1,5094,1055,4 -5844,permute_905,call_function,permute.default,backward,9,1,1,1,5095,1054,4 
-5845,_scaled_dot_product_flash_attention_backward_18,call_function,_scaled_dot_product_flash_attention_backward.default,backward,9,8,8,3,5099,1053,2 -5846,getitem_306,call_function,getitem,backward,9,1,1,1,5100,1026,2 -5847,getitem_307,call_function,getitem,backward,9,1,1,1,5100,1027,2 -5848,getitem_308,call_function,getitem,backward,9,1,1,1,5100,1020,2 -5849,permute_906,call_function,permute.default,backward,9,1,1,1,5101,1019,2 -5850,permute_907,call_function,permute.default,backward,9,1,1,1,5101,1026,2 -5851,permute_908,call_function,permute.default,backward,9,1,1,1,5101,1025,2 -5852,convert_element_type_1519,call_function,convert_element_type.default,backward,9,1,1,1,5102,1025,2 -5853,convert_element_type_1520,call_function,convert_element_type.default,backward,9,1,1,1,5102,1024,2 -5854,view_1053,call_function,view.default,backward,9,1,1,1,5103,1024,2 -5855,view_as_complex_92,call_function,view_as_complex.default,backward,9,1,1,1,5104,1023,6 -5856,_conj_36,call_function,_conj.default,backward,9,1,1,1,4,1024,3 -5857,clone_150,call_function,clone.default,backward,9,1,1,1,5,1023,3 -5858,mul_576,call_function,mul.Tensor,backward,9,2,2,1,5107,1022,8 -5859,view_1054,call_function,view.default,backward,9,1,1,1,5103,1023,2 -5860,view_as_complex_93,call_function,view_as_complex.default,backward,9,1,1,1,5104,1022,6 -5861,_conj_37,call_function,_conj.default,backward,9,1,1,1,4,1023,3 -5862,clone_151,call_function,clone.default,backward,9,1,1,1,5,1022,3 -5863,mul_577,call_function,mul.Tensor,backward,9,2,2,1,5107,1021,8 -5864,view_as_real_92,call_function,view_as_real.default,backward,9,1,1,1,5108,1021,6 -5865,view_1055,call_function,view.default,backward,9,1,1,1,5109,1020,6 -5866,convert_element_type_1521,call_function,convert_element_type.default,backward,9,1,1,1,5110,1019,6 -5867,view_as_real_93,call_function,view_as_real.default,backward,9,1,1,1,5108,1020,6 -5868,view_1056,call_function,view.default,backward,9,1,1,1,5109,1019,6 
-5869,convert_element_type_1522,call_function,convert_element_type.default,backward,9,1,1,1,5110,1018,6 -5870,view_1057,call_function,view.default,backward,9,1,1,1,5102,1018,2 -5871,view_1058,call_function,view.default,backward,9,1,1,1,5111,1018,5 -5872,view_1059,call_function,view.default,backward,9,1,1,1,5111,1017,5 -5873,alias_default_1095,call_function,alias.default,backward,9,1,1,2,5103,1017,4 -5874,einsum_default_459,call_function,einsum.default,backward,9,2,2,1,5104,3,5 -5875,permute_911,call_function,permute.default,backward,9,1,1,1,4,1013,3 -5876,einsum_default_460,call_function,einsum.default,backward,9,2,2,1,5105,1012,5 -5877,permute_912,call_function,permute.default,backward,9,1,1,1,5105,2,4 -5878,dtype_cast_424,call_function,dtype_cast.default,backward,9,1,1,1,5106,1,4 -5879,alias_default_1330,call_function,alias.default,backward,9,1,1,0,5107,0,3 -5880,alias_default_1096,call_function,alias.default,backward,9,1,1,2,5112,1017,4 -5881,einsum_default_461,call_function,einsum.default,backward,9,2,2,1,5113,3,5 -5882,permute_915,call_function,permute.default,backward,9,1,1,1,4,1013,3 -5883,einsum_default_462,call_function,einsum.default,backward,9,2,2,1,5114,1012,5 -5884,add_271,call_function,add.Tensor,unknown,,2,2,1,5121,1011,10 -5885,permute_916,call_function,permute.default,backward,9,1,1,1,5114,2,4 -5886,dtype_cast_425,call_function,dtype_cast.default,backward,9,1,1,1,5115,1,4 -5887,alias_default_1329,call_function,alias.default,backward,9,1,1,0,5116,0,3 -5888,alias_default_1097,call_function,alias.default,backward,9,1,1,2,5112,1016,4 -5889,einsum_default_463,call_function,einsum.default,backward,9,2,2,1,5113,3,5 -5890,permute_919,call_function,permute.default,backward,9,1,1,1,4,1012,3 -5891,einsum_default_464,call_function,einsum.default,backward,9,2,2,1,5114,1011,5 -5892,add_272,call_function,add.Tensor,unknown,,2,2,1,5137,1010,10 -5893,permute_920,call_function,permute.default,backward,9,1,1,1,5114,2,4 
-5894,dtype_cast_426,call_function,dtype_cast.default,backward,9,1,1,1,5115,1,4 -5895,alias_default_1328,call_function,alias.default,backward,9,1,1,0,5116,0,3 -5896,convert_element_type_1535,call_function,convert_element_type.default,backward,9,1,1,1,5138,1009,8 -5897,convert_element_type_1536,call_function,convert_element_type.default,backward,9,1,1,1,999,1009,4 -5898,convert_element_type_1537,call_function,convert_element_type.default,backward,9,1,1,1,3,1003,2 -5899,alias_default_1098,call_function,alias.default,backward,9,1,1,2,5139,1008,4 -5900,mul_578,call_function,mul.Tensor,backward,9,2,2,1,5141,1002,8 -5901,mul_579,call_function,mul.Tensor,backward,9,2,2,1,1007,1008,8 -5902,alias_default_1099,call_function,alias.default,backward,9,1,1,2,5142,1001,4 -5903,alias_default_1100,call_function,alias.default,backward,9,1,1,3,1008,1007,4 -5904,mul_580,call_function,mul.Tensor,backward,9,2,2,1,5146,1000,8 -5905,sum_77,call_function,sum.dim_IntList,backward,9,1,1,1,5147,999,5 -5906,div_66,call_function,div.Tensor,backward,9,1,1,1,1009,999,6 -5907,mul_581,call_function,mul.Tensor,backward,9,2,2,1,5149,998,8 -5908,sub_57,call_function,sub.Tensor,backward,9,2,2,1,5150,997,10 -5909,mul_582,call_function,mul.Tensor,backward,9,2,2,1,5151,996,8 -5910,mul_583,call_function,mul.Tensor,backward,9,2,2,1,5143,4,8 -5911,sum_78,call_function,sum.dim_IntList,backward,9,1,1,1,5144,3,5 -5912,convert_element_type_1538,call_function,convert_element_type.default,backward,9,1,1,1,5152,995,6 -5913,convert_element_type_1539,call_function,convert_element_type.default,backward,9,1,1,1,5145,2,3 -5914,add_273,call_function,add.Tensor,unknown,,2,2,1,5153,994,10 -5915,dtype_cast_427,call_function,dtype_cast.default,backward,9,1,1,1,5146,1,3 -5916,alias_default_1335,call_function,alias.default,backward,9,1,1,0,5147,0,2 -5917,alias_default_1101,call_function,alias.default,unknown,,1,1,3,5154,993,4 -5918,einsum_default_465,call_function,einsum.default,backward,8,2,2,1,5155,3,5 
-5919,permute_923,call_function,permute.default,backward,8,1,1,1,4,989,3 -5920,einsum_default_466,call_function,einsum.default,backward,8,2,2,1,5156,988,5 -5921,permute_924,call_function,permute.default,backward,8,1,1,1,5156,2,4 -5922,dtype_cast_428,call_function,dtype_cast.default,backward,8,1,1,1,5157,1,4 -5923,alias_default_1324,call_function,alias.default,backward,8,1,1,0,5158,0,3 -5924,alias_default_1102,call_function,alias.default,backward,8,1,1,2,5157,987,4 -5925,mul_584,call_function,mul.Tensor,backward,8,2,2,1,5158,975,8 -5926,mul_585,call_function,mul.Tensor,backward,8,2,2,1,5158,979,8 -5927,alias_default_1103,call_function,alias.default,backward,8,1,1,2,5159,974,4 -5928,einsum_default_467,call_function,einsum.default,backward,8,2,2,1,5160,3,5 -5929,permute_927,call_function,permute.default,backward,8,1,1,1,4,970,3 -5930,einsum_default_468,call_function,einsum.default,backward,8,2,2,1,5161,969,5 -5931,permute_928,call_function,permute.default,backward,8,1,1,1,5161,2,4 -5932,dtype_cast_429,call_function,dtype_cast.default,backward,8,1,1,1,5162,1,4 -5933,alias_default_1325,call_function,alias.default,backward,8,1,1,0,5163,0,3 -5934,convert_element_type_1548,call_function,convert_element_type.default,backward,8,1,1,1,5159,978,6 -5935,convert_element_type_1549,call_function,convert_element_type.default,backward,8,1,1,1,976,988,4 -5936,alias_default_1104,call_function,alias.default,backward,8,1,1,2,977,987,4 -5937,neg_47,call_function,neg.default,backward,8,1,1,1,978,986,8 -5938,exp_47,call_function,exp.default,backward,8,1,1,1,979,985,6 -5939,add_274,call_function,add.Tensor,backward,8,1,1,1,980,984,4 -5940,reciprocal_19,call_function,reciprocal.default,backward,8,1,1,1,981,983,4 -5941,mul_586,call_function,mul.Tensor,backward,8,1,1,1,982,982,6 -5942,alias_default_1105,call_function,alias.default,backward,8,1,1,2,983,981,4 -5943,mul_587,call_function,mul.Tensor,backward,8,2,2,1,5168,977,8 -5944,sub_58,call_function,sub.Tensor,backward,8,1,1,1,984,979,4 
-5945,mul_588,call_function,mul.Tensor,backward,8,2,2,1,985,978,8 -5946,add_275,call_function,add.Tensor,backward,8,1,1,1,986,977,4 -5947,mul_589,call_function,mul.Tensor,backward,8,2,2,1,5172,976,8 -5948,convert_element_type_1550,call_function,convert_element_type.default,backward,8,1,1,1,5173,975,6 -5949,alias_default_1106,call_function,alias.default,backward,8,1,1,2,5174,974,4 -5950,einsum_default_469,call_function,einsum.default,backward,8,2,2,1,5175,3,5 -5951,permute_931,call_function,permute.default,backward,8,1,1,1,4,970,3 -5952,einsum_default_470,call_function,einsum.default,backward,8,2,2,1,5176,969,5 -5953,add_276,call_function,add.Tensor,unknown,,2,2,1,5181,968,10 -5954,permute_932,call_function,permute.default,backward,8,1,1,1,5176,2,4 -5955,dtype_cast_430,call_function,dtype_cast.default,backward,8,1,1,1,5177,1,4 -5956,alias_default_1323,call_function,alias.default,backward,8,1,1,0,5178,0,3 -5957,convert_element_type_1555,call_function,convert_element_type.default,backward,8,1,1,1,5182,967,8 -5958,convert_element_type_1556,call_function,convert_element_type.default,backward,8,1,1,1,956,967,4 -5959,convert_element_type_1557,call_function,convert_element_type.default,backward,8,1,1,1,3,961,2 -5960,alias_default_1107,call_function,alias.default,backward,8,1,1,2,5183,966,4 -5961,mul_590,call_function,mul.Tensor,backward,8,2,2,1,5185,960,8 -5962,mul_591,call_function,mul.Tensor,backward,8,2,2,1,964,966,8 -5963,alias_default_1108,call_function,alias.default,backward,8,1,1,2,5186,959,4 -5964,alias_default_1109,call_function,alias.default,backward,8,1,1,3,965,965,4 -5965,mul_592,call_function,mul.Tensor,backward,8,2,2,1,5190,958,8 -5966,sum_79,call_function,sum.dim_IntList,backward,8,1,1,1,5191,957,5 -5967,div_67,call_function,div.Tensor,backward,8,1,1,1,966,957,6 -5968,mul_593,call_function,mul.Tensor,backward,8,2,2,1,5193,956,8 -5969,sub_59,call_function,sub.Tensor,backward,8,2,2,1,5194,955,10 
-5970,mul_594,call_function,mul.Tensor,backward,8,2,2,1,5195,954,8 -5971,mul_595,call_function,mul.Tensor,backward,8,2,2,1,5187,4,8 -5972,sum_80,call_function,sum.dim_IntList,backward,8,1,1,1,5188,3,5 -5973,convert_element_type_1558,call_function,convert_element_type.default,backward,8,1,1,1,5196,953,6 -5974,convert_element_type_1559,call_function,convert_element_type.default,backward,8,1,1,1,5189,2,3 -5975,add_277,call_function,add.Tensor,unknown,,2,2,1,5197,952,10 -5976,dtype_cast_431,call_function,dtype_cast.default,backward,8,1,1,1,5190,1,3 -5977,alias_default_1327,call_function,alias.default,backward,8,1,1,0,5191,0,2 -5978,alias_default_1110,call_function,alias.default,unknown,,1,1,3,5198,951,4 -5979,einsum_default_471,call_function,einsum.default,backward,8,2,2,1,5199,3,5 -5980,permute_935,call_function,permute.default,backward,8,1,1,1,4,947,3 -5981,einsum_default_472,call_function,einsum.default,backward,8,2,2,1,5200,946,5 -5982,permute_936,call_function,permute.default,backward,8,1,1,1,5200,2,4 -5983,dtype_cast_432,call_function,dtype_cast.default,backward,8,1,1,1,5201,1,4 -5984,alias_default_1322,call_function,alias.default,backward,8,1,1,0,5202,0,3 -5985,view_1074,call_function,view.default,backward,8,1,1,1,5201,945,4 -5986,permute_937,call_function,permute.default,backward,8,1,1,1,5202,944,4 -5987,_scaled_dot_product_flash_attention_backward_19,call_function,_scaled_dot_product_flash_attention_backward.default,backward,8,8,8,3,5206,943,2 -5988,getitem_309,call_function,getitem,backward,8,1,1,1,5207,916,2 -5989,getitem_310,call_function,getitem,backward,8,1,1,1,5207,917,2 -5990,getitem_311,call_function,getitem,backward,8,1,1,1,5207,910,2 -5991,permute_938,call_function,permute.default,backward,8,1,1,1,5208,909,2 -5992,permute_939,call_function,permute.default,backward,8,1,1,1,5208,916,2 -5993,permute_940,call_function,permute.default,backward,8,1,1,1,5208,915,2 
-5994,convert_element_type_1564,call_function,convert_element_type.default,backward,8,1,1,1,5209,915,2 -5995,convert_element_type_1565,call_function,convert_element_type.default,backward,8,1,1,1,5209,914,2 -5996,view_1075,call_function,view.default,backward,8,1,1,1,5210,914,2 -5997,view_as_complex_94,call_function,view_as_complex.default,backward,8,1,1,1,5211,913,6 -5998,_conj_38,call_function,_conj.default,backward,8,1,1,1,4,914,3 -5999,clone_158,call_function,clone.default,backward,8,1,1,1,5,913,3 -6000,mul_596,call_function,mul.Tensor,backward,8,2,2,1,5214,912,8 -6001,view_1076,call_function,view.default,backward,8,1,1,1,5210,913,2 -6002,view_as_complex_95,call_function,view_as_complex.default,backward,8,1,1,1,5211,912,6 -6003,_conj_39,call_function,_conj.default,backward,8,1,1,1,4,913,3 -6004,clone_159,call_function,clone.default,backward,8,1,1,1,5,912,3 -6005,mul_597,call_function,mul.Tensor,backward,8,2,2,1,5214,911,8 -6006,view_as_real_94,call_function,view_as_real.default,backward,8,1,1,1,5215,911,6 -6007,view_1077,call_function,view.default,backward,8,1,1,1,5216,910,6 -6008,convert_element_type_1566,call_function,convert_element_type.default,backward,8,1,1,1,5217,909,6 -6009,view_as_real_95,call_function,view_as_real.default,backward,8,1,1,1,5215,910,6 -6010,view_1078,call_function,view.default,backward,8,1,1,1,5216,909,6 -6011,convert_element_type_1567,call_function,convert_element_type.default,backward,8,1,1,1,5217,908,6 -6012,view_1079,call_function,view.default,backward,8,1,1,1,5209,908,2 -6013,view_1080,call_function,view.default,backward,8,1,1,1,5218,908,5 -6014,view_1081,call_function,view.default,backward,8,1,1,1,5218,907,5 -6015,alias_default_1111,call_function,alias.default,backward,8,1,1,2,5210,907,4 -6016,einsum_default_473,call_function,einsum.default,backward,8,2,2,1,5211,3,5 -6017,permute_943,call_function,permute.default,backward,8,1,1,1,4,903,3 -6018,einsum_default_474,call_function,einsum.default,backward,8,2,2,1,5212,902,5 
-6019,permute_944,call_function,permute.default,backward,8,1,1,1,5212,2,4 -6020,dtype_cast_433,call_function,dtype_cast.default,backward,8,1,1,1,5213,1,4 -6021,alias_default_1321,call_function,alias.default,backward,8,1,1,0,5214,0,3 -6022,alias_default_1112,call_function,alias.default,backward,8,1,1,2,5219,907,4 -6023,einsum_default_475,call_function,einsum.default,backward,8,2,2,1,5220,3,5 -6024,permute_947,call_function,permute.default,backward,8,1,1,1,4,903,3 -6025,einsum_default_476,call_function,einsum.default,backward,8,2,2,1,5221,902,5 -6026,add_278,call_function,add.Tensor,unknown,,2,2,1,5228,901,10 -6027,permute_948,call_function,permute.default,backward,8,1,1,1,5221,2,4 -6028,dtype_cast_434,call_function,dtype_cast.default,backward,8,1,1,1,5222,1,4 -6029,alias_default_1320,call_function,alias.default,backward,8,1,1,0,5223,0,3 -6030,alias_default_1113,call_function,alias.default,backward,8,1,1,2,5219,906,4 -6031,einsum_default_477,call_function,einsum.default,backward,8,2,2,1,5220,3,5 -6032,permute_951,call_function,permute.default,backward,8,1,1,1,4,902,3 -6033,einsum_default_478,call_function,einsum.default,backward,8,2,2,1,5221,901,5 -6034,add_279,call_function,add.Tensor,unknown,,2,2,1,5244,900,10 -6035,permute_952,call_function,permute.default,backward,8,1,1,1,5221,2,4 -6036,dtype_cast_435,call_function,dtype_cast.default,backward,8,1,1,1,5222,1,4 -6037,alias_default_1319,call_function,alias.default,backward,8,1,1,0,5223,0,3 -6038,convert_element_type_1580,call_function,convert_element_type.default,backward,8,1,1,1,5245,899,8 -6039,convert_element_type_1581,call_function,convert_element_type.default,backward,8,1,1,1,889,899,4 -6040,convert_element_type_1582,call_function,convert_element_type.default,backward,8,1,1,1,3,893,2 -6041,alias_default_1114,call_function,alias.default,backward,8,1,1,2,5246,898,4 -6042,mul_598,call_function,mul.Tensor,backward,8,2,2,1,5248,892,8 -6043,mul_599,call_function,mul.Tensor,backward,8,2,2,1,897,898,8 
-6044,alias_default_1115,call_function,alias.default,backward,8,1,1,2,5249,891,4 -6045,alias_default_1116,call_function,alias.default,backward,8,1,1,3,898,897,4 -6046,mul_600,call_function,mul.Tensor,backward,8,2,2,1,5253,890,8 -6047,sum_81,call_function,sum.dim_IntList,backward,8,1,1,1,5254,889,5 -6048,div_68,call_function,div.Tensor,backward,8,1,1,1,899,889,6 -6049,mul_601,call_function,mul.Tensor,backward,8,2,2,1,5256,888,8 -6050,sub_60,call_function,sub.Tensor,backward,8,2,2,1,5257,887,10 -6051,mul_602,call_function,mul.Tensor,backward,8,2,2,1,5258,886,8 -6052,mul_603,call_function,mul.Tensor,backward,8,2,2,1,5250,4,8 -6053,sum_82,call_function,sum.dim_IntList,backward,8,1,1,1,5251,3,5 -6054,convert_element_type_1583,call_function,convert_element_type.default,backward,8,1,1,1,5259,885,6 -6055,convert_element_type_1584,call_function,convert_element_type.default,backward,8,1,1,1,5252,2,3 -6056,add_280,call_function,add.Tensor,unknown,,2,2,1,5260,884,10 -6057,dtype_cast_436,call_function,dtype_cast.default,backward,8,1,1,1,5253,1,3 -6058,alias_default_1326,call_function,alias.default,backward,8,1,1,0,5254,0,2 -6059,alias_default_1117,call_function,alias.default,unknown,,1,1,3,5261,883,4 -6060,einsum_default_479,call_function,einsum.default,backward,7,2,2,1,5262,3,5 -6061,permute_955,call_function,permute.default,backward,7,1,1,1,4,879,3 -6062,einsum_default_480,call_function,einsum.default,backward,7,2,2,1,5263,878,5 -6063,permute_956,call_function,permute.default,backward,7,1,1,1,5263,2,4 -6064,dtype_cast_437,call_function,dtype_cast.default,backward,7,1,1,1,5264,1,4 -6065,alias_default_1315,call_function,alias.default,backward,7,1,1,0,5265,0,3 -6066,alias_default_1118,call_function,alias.default,backward,7,1,1,2,5264,877,4 -6067,mul_604,call_function,mul.Tensor,backward,7,2,2,1,5265,865,8 -6068,mul_605,call_function,mul.Tensor,backward,7,2,2,1,5265,869,8 -6069,alias_default_1119,call_function,alias.default,backward,7,1,1,2,5266,864,4 
-6070,einsum_default_481,call_function,einsum.default,backward,7,2,2,1,5267,3,5 -6071,permute_959,call_function,permute.default,backward,7,1,1,1,4,860,3 -6072,einsum_default_482,call_function,einsum.default,backward,7,2,2,1,5268,859,5 -6073,permute_960,call_function,permute.default,backward,7,1,1,1,5268,2,4 -6074,dtype_cast_438,call_function,dtype_cast.default,backward,7,1,1,1,5269,1,4 -6075,alias_default_1316,call_function,alias.default,backward,7,1,1,0,5270,0,3 -6076,convert_element_type_1593,call_function,convert_element_type.default,backward,7,1,1,1,5266,868,6 -6077,convert_element_type_1594,call_function,convert_element_type.default,backward,7,1,1,1,866,878,4 -6078,alias_default_1120,call_function,alias.default,backward,7,1,1,2,867,877,4 -6079,neg_48,call_function,neg.default,backward,7,1,1,1,868,876,8 -6080,exp_48,call_function,exp.default,backward,7,1,1,1,869,875,6 -6081,add_281,call_function,add.Tensor,backward,7,1,1,1,870,874,4 -6082,reciprocal_20,call_function,reciprocal.default,backward,7,1,1,1,871,873,4 -6083,mul_606,call_function,mul.Tensor,backward,7,1,1,1,872,872,6 -6084,alias_default_1121,call_function,alias.default,backward,7,1,1,2,873,871,4 -6085,mul_607,call_function,mul.Tensor,backward,7,2,2,1,5275,867,8 -6086,sub_61,call_function,sub.Tensor,backward,7,1,1,1,874,869,4 -6087,mul_608,call_function,mul.Tensor,backward,7,2,2,1,875,868,8 -6088,add_282,call_function,add.Tensor,backward,7,1,1,1,876,867,4 -6089,mul_609,call_function,mul.Tensor,backward,7,2,2,1,5279,866,8 -6090,convert_element_type_1595,call_function,convert_element_type.default,backward,7,1,1,1,5280,865,6 -6091,alias_default_1122,call_function,alias.default,backward,7,1,1,2,5281,864,4 -6092,einsum_default_483,call_function,einsum.default,backward,7,2,2,1,5282,3,5 -6093,permute_963,call_function,permute.default,backward,7,1,1,1,4,860,3 -6094,einsum_default_484,call_function,einsum.default,backward,7,2,2,1,5283,859,5 -6095,add_283,call_function,add.Tensor,unknown,,2,2,1,5288,858,10 
-6096,permute_964,call_function,permute.default,backward,7,1,1,1,5283,2,4 -6097,dtype_cast_439,call_function,dtype_cast.default,backward,7,1,1,1,5284,1,4 -6098,alias_default_1314,call_function,alias.default,backward,7,1,1,0,5285,0,3 -6099,convert_element_type_1600,call_function,convert_element_type.default,backward,7,1,1,1,5289,857,8 -6100,convert_element_type_1601,call_function,convert_element_type.default,backward,7,1,1,1,846,857,4 -6101,convert_element_type_1602,call_function,convert_element_type.default,backward,7,1,1,1,3,851,2 -6102,alias_default_1123,call_function,alias.default,backward,7,1,1,2,5290,856,4 -6103,mul_610,call_function,mul.Tensor,backward,7,2,2,1,5292,850,8 -6104,mul_611,call_function,mul.Tensor,backward,7,2,2,1,854,856,8 -6105,alias_default_1124,call_function,alias.default,backward,7,1,1,2,5293,849,4 -6106,alias_default_1125,call_function,alias.default,backward,7,1,1,3,855,855,4 -6107,mul_612,call_function,mul.Tensor,backward,7,2,2,1,5297,848,8 -6108,sum_83,call_function,sum.dim_IntList,backward,7,1,1,1,5298,847,5 -6109,div_69,call_function,div.Tensor,backward,7,1,1,1,856,847,6 -6110,mul_613,call_function,mul.Tensor,backward,7,2,2,1,5300,846,8 -6111,sub_62,call_function,sub.Tensor,backward,7,2,2,1,5301,845,10 -6112,mul_614,call_function,mul.Tensor,backward,7,2,2,1,5302,844,8 -6113,mul_615,call_function,mul.Tensor,backward,7,2,2,1,5294,4,8 -6114,sum_84,call_function,sum.dim_IntList,backward,7,1,1,1,5295,3,5 -6115,convert_element_type_1603,call_function,convert_element_type.default,backward,7,1,1,1,5303,843,6 -6116,convert_element_type_1604,call_function,convert_element_type.default,backward,7,1,1,1,5296,2,3 -6117,add_284,call_function,add.Tensor,unknown,,2,2,1,5304,842,10 -6118,dtype_cast_440,call_function,dtype_cast.default,backward,7,1,1,1,5297,1,3 -6119,alias_default_1318,call_function,alias.default,backward,7,1,1,0,5298,0,2 -6120,alias_default_1126,call_function,alias.default,unknown,,1,1,3,5305,841,4 
-6121,einsum_default_485,call_function,einsum.default,backward,7,2,2,1,5306,3,5 -6122,permute_967,call_function,permute.default,backward,7,1,1,1,4,837,3 -6123,einsum_default_486,call_function,einsum.default,backward,7,2,2,1,5307,836,5 -6124,permute_968,call_function,permute.default,backward,7,1,1,1,5307,2,4 -6125,dtype_cast_441,call_function,dtype_cast.default,backward,7,1,1,1,5308,1,4 -6126,alias_default_1313,call_function,alias.default,backward,7,1,1,0,5309,0,3 -6127,view_1096,call_function,view.default,backward,7,1,1,1,5308,835,4 -6128,permute_969,call_function,permute.default,backward,7,1,1,1,5309,834,4 -6129,_scaled_dot_product_flash_attention_backward_20,call_function,_scaled_dot_product_flash_attention_backward.default,backward,7,8,8,3,5313,833,2 -6130,getitem_312,call_function,getitem,backward,7,1,1,1,5314,806,2 -6131,getitem_313,call_function,getitem,backward,7,1,1,1,5314,807,2 -6132,getitem_314,call_function,getitem,backward,7,1,1,1,5314,800,2 -6133,permute_970,call_function,permute.default,backward,7,1,1,1,5315,799,2 -6134,permute_971,call_function,permute.default,backward,7,1,1,1,5315,806,2 -6135,permute_972,call_function,permute.default,backward,7,1,1,1,5315,805,2 -6136,convert_element_type_1609,call_function,convert_element_type.default,backward,7,1,1,1,5316,805,2 -6137,convert_element_type_1610,call_function,convert_element_type.default,backward,7,1,1,1,5316,804,2 -6138,view_1097,call_function,view.default,backward,7,1,1,1,5317,804,2 -6139,view_as_complex_96,call_function,view_as_complex.default,backward,7,1,1,1,5318,803,6 -6140,_conj_40,call_function,_conj.default,backward,7,1,1,1,4,804,3 -6141,clone_166,call_function,clone.default,backward,7,1,1,1,5,803,3 -6142,mul_616,call_function,mul.Tensor,backward,7,2,2,1,5321,802,8 -6143,view_1098,call_function,view.default,backward,7,1,1,1,5317,803,2 -6144,view_as_complex_97,call_function,view_as_complex.default,backward,7,1,1,1,5318,802,6 -6145,_conj_41,call_function,_conj.default,backward,7,1,1,1,4,803,3 
-6146,clone_167,call_function,clone.default,backward,7,1,1,1,5,802,3 -6147,mul_617,call_function,mul.Tensor,backward,7,2,2,1,5321,801,8 -6148,view_as_real_96,call_function,view_as_real.default,backward,7,1,1,1,5322,801,6 -6149,view_1099,call_function,view.default,backward,7,1,1,1,5323,800,6 -6150,convert_element_type_1611,call_function,convert_element_type.default,backward,7,1,1,1,5324,799,6 -6151,view_as_real_97,call_function,view_as_real.default,backward,7,1,1,1,5322,800,6 -6152,view_1100,call_function,view.default,backward,7,1,1,1,5323,799,6 -6153,convert_element_type_1612,call_function,convert_element_type.default,backward,7,1,1,1,5324,798,6 -6154,view_1101,call_function,view.default,backward,7,1,1,1,5316,798,2 -6155,view_1102,call_function,view.default,backward,7,1,1,1,5325,798,5 -6156,view_1103,call_function,view.default,backward,7,1,1,1,5325,797,5 -6157,alias_default_1127,call_function,alias.default,backward,7,1,1,2,5317,797,4 -6158,einsum_default_487,call_function,einsum.default,backward,7,2,2,1,5318,3,5 -6159,permute_975,call_function,permute.default,backward,7,1,1,1,4,793,3 -6160,einsum_default_488,call_function,einsum.default,backward,7,2,2,1,5319,792,5 -6161,permute_976,call_function,permute.default,backward,7,1,1,1,5319,2,4 -6162,dtype_cast_442,call_function,dtype_cast.default,backward,7,1,1,1,5320,1,4 -6163,alias_default_1312,call_function,alias.default,backward,7,1,1,0,5321,0,3 -6164,alias_default_1128,call_function,alias.default,backward,7,1,1,2,5326,797,4 -6165,einsum_default_489,call_function,einsum.default,backward,7,2,2,1,5327,3,5 -6166,permute_979,call_function,permute.default,backward,7,1,1,1,4,793,3 -6167,einsum_default_490,call_function,einsum.default,backward,7,2,2,1,5328,792,5 -6168,add_285,call_function,add.Tensor,unknown,,2,2,1,5335,791,10 -6169,permute_980,call_function,permute.default,backward,7,1,1,1,5328,2,4 -6170,dtype_cast_443,call_function,dtype_cast.default,backward,7,1,1,1,5329,1,4 
-6171,alias_default_1311,call_function,alias.default,backward,7,1,1,0,5330,0,3 -6172,alias_default_1129,call_function,alias.default,backward,7,1,1,2,5326,796,4 -6173,einsum_default_491,call_function,einsum.default,backward,7,2,2,1,5327,3,5 -6174,permute_983,call_function,permute.default,backward,7,1,1,1,4,792,3 -6175,einsum_default_492,call_function,einsum.default,backward,7,2,2,1,5328,791,5 -6176,add_286,call_function,add.Tensor,unknown,,2,2,1,5351,790,10 -6177,permute_984,call_function,permute.default,backward,7,1,1,1,5328,2,4 -6178,dtype_cast_444,call_function,dtype_cast.default,backward,7,1,1,1,5329,1,4 -6179,alias_default_1310,call_function,alias.default,backward,7,1,1,0,5330,0,3 -6180,convert_element_type_1625,call_function,convert_element_type.default,backward,7,1,1,1,5352,789,8 -6181,convert_element_type_1626,call_function,convert_element_type.default,backward,7,1,1,1,779,789,4 -6182,convert_element_type_1627,call_function,convert_element_type.default,backward,7,1,1,1,3,783,2 -6183,alias_default_1130,call_function,alias.default,backward,7,1,1,2,5353,788,4 -6184,mul_618,call_function,mul.Tensor,backward,7,2,2,1,5355,782,8 -6185,mul_619,call_function,mul.Tensor,backward,7,2,2,1,787,788,8 -6186,alias_default_1131,call_function,alias.default,backward,7,1,1,2,5356,781,4 -6187,alias_default_1132,call_function,alias.default,backward,7,1,1,3,788,787,4 -6188,mul_620,call_function,mul.Tensor,backward,7,2,2,1,5360,780,8 -6189,sum_85,call_function,sum.dim_IntList,backward,7,1,1,1,5361,779,5 -6190,div_70,call_function,div.Tensor,backward,7,1,1,1,789,779,6 -6191,mul_621,call_function,mul.Tensor,backward,7,2,2,1,5363,778,8 -6192,sub_63,call_function,sub.Tensor,backward,7,2,2,1,5364,777,10 -6193,mul_622,call_function,mul.Tensor,backward,7,2,2,1,5365,776,8 -6194,mul_623,call_function,mul.Tensor,backward,7,2,2,1,5357,4,8 -6195,sum_86,call_function,sum.dim_IntList,backward,7,1,1,1,5358,3,5 
-6196,convert_element_type_1628,call_function,convert_element_type.default,backward,7,1,1,1,5366,775,6 -6197,convert_element_type_1629,call_function,convert_element_type.default,backward,7,1,1,1,5359,2,3 -6198,add_287,call_function,add.Tensor,unknown,,2,2,1,5367,774,10 -6199,dtype_cast_445,call_function,dtype_cast.default,backward,7,1,1,1,5360,1,3 -6200,alias_default_1317,call_function,alias.default,backward,7,1,1,0,5361,0,2 -6201,alias_default_1133,call_function,alias.default,unknown,,1,1,3,5368,773,4 -6202,einsum_default_493,call_function,einsum.default,backward,6,2,2,1,5369,3,5 -6203,permute_987,call_function,permute.default,backward,6,1,1,1,4,769,3 -6204,einsum_default_494,call_function,einsum.default,backward,6,2,2,1,5370,768,5 -6205,permute_988,call_function,permute.default,backward,6,1,1,1,5370,2,4 -6206,dtype_cast_446,call_function,dtype_cast.default,backward,6,1,1,1,5371,1,4 -6207,alias_default_1306,call_function,alias.default,backward,6,1,1,0,5372,0,3 -6208,alias_default_1134,call_function,alias.default,backward,6,1,1,2,5371,767,4 -6209,mul_624,call_function,mul.Tensor,backward,6,2,2,1,5372,755,8 -6210,mul_625,call_function,mul.Tensor,backward,6,2,2,1,5372,759,8 -6211,alias_default_1135,call_function,alias.default,backward,6,1,1,2,5373,754,4 -6212,einsum_default_495,call_function,einsum.default,backward,6,2,2,1,5374,3,5 -6213,permute_991,call_function,permute.default,backward,6,1,1,1,4,750,3 -6214,einsum_default_496,call_function,einsum.default,backward,6,2,2,1,5375,749,5 -6215,permute_992,call_function,permute.default,backward,6,1,1,1,5375,2,4 -6216,dtype_cast_447,call_function,dtype_cast.default,backward,6,1,1,1,5376,1,4 -6217,alias_default_1307,call_function,alias.default,backward,6,1,1,0,5377,0,3 -6218,convert_element_type_1638,call_function,convert_element_type.default,backward,6,1,1,1,5373,758,6 -6219,convert_element_type_1639,call_function,convert_element_type.default,backward,6,1,1,1,756,768,4 
-6220,alias_default_1136,call_function,alias.default,backward,6,1,1,2,757,767,4 -6221,neg_49,call_function,neg.default,backward,6,1,1,1,758,766,8 -6222,exp_49,call_function,exp.default,backward,6,1,1,1,759,765,6 -6223,add_288,call_function,add.Tensor,backward,6,1,1,1,760,764,4 -6224,reciprocal_21,call_function,reciprocal.default,backward,6,1,1,1,761,763,4 -6225,mul_626,call_function,mul.Tensor,backward,6,1,1,1,762,762,6 -6226,alias_default_1137,call_function,alias.default,backward,6,1,1,2,763,761,4 -6227,mul_627,call_function,mul.Tensor,backward,6,2,2,1,5382,757,8 -6228,sub_64,call_function,sub.Tensor,backward,6,1,1,1,764,759,4 -6229,mul_628,call_function,mul.Tensor,backward,6,2,2,1,765,758,8 -6230,add_289,call_function,add.Tensor,backward,6,1,1,1,766,757,4 -6231,mul_629,call_function,mul.Tensor,backward,6,2,2,1,5386,756,8 -6232,convert_element_type_1640,call_function,convert_element_type.default,backward,6,1,1,1,5387,755,6 -6233,alias_default_1138,call_function,alias.default,backward,6,1,1,2,5388,754,4 -6234,einsum_default_497,call_function,einsum.default,backward,6,2,2,1,5389,3,5 -6235,permute_995,call_function,permute.default,backward,6,1,1,1,4,750,3 -6236,einsum_default_498,call_function,einsum.default,backward,6,2,2,1,5390,749,5 -6237,add_290,call_function,add.Tensor,unknown,,2,2,1,5395,748,10 -6238,permute_996,call_function,permute.default,backward,6,1,1,1,5390,2,4 -6239,dtype_cast_448,call_function,dtype_cast.default,backward,6,1,1,1,5391,1,4 -6240,alias_default_1305,call_function,alias.default,backward,6,1,1,0,5392,0,3 -6241,convert_element_type_1645,call_function,convert_element_type.default,backward,6,1,1,1,5396,747,8 -6242,convert_element_type_1646,call_function,convert_element_type.default,backward,6,1,1,1,736,747,4 -6243,convert_element_type_1647,call_function,convert_element_type.default,backward,6,1,1,1,3,741,2 -6244,alias_default_1139,call_function,alias.default,backward,6,1,1,2,5397,746,4 
-6245,mul_630,call_function,mul.Tensor,backward,6,2,2,1,5399,740,8 -6246,mul_631,call_function,mul.Tensor,backward,6,2,2,1,744,746,8 -6247,alias_default_1140,call_function,alias.default,backward,6,1,1,2,5400,739,4 -6248,alias_default_1141,call_function,alias.default,backward,6,1,1,3,745,745,4 -6249,mul_632,call_function,mul.Tensor,backward,6,2,2,1,5404,738,8 -6250,sum_87,call_function,sum.dim_IntList,backward,6,1,1,1,5405,737,5 -6251,div_71,call_function,div.Tensor,backward,6,1,1,1,746,737,6 -6252,mul_633,call_function,mul.Tensor,backward,6,2,2,1,5407,736,8 -6253,sub_65,call_function,sub.Tensor,backward,6,2,2,1,5408,735,10 -6254,mul_634,call_function,mul.Tensor,backward,6,2,2,1,5409,734,8 -6255,mul_635,call_function,mul.Tensor,backward,6,2,2,1,5401,4,8 -6256,sum_88,call_function,sum.dim_IntList,backward,6,1,1,1,5402,3,5 -6257,convert_element_type_1648,call_function,convert_element_type.default,backward,6,1,1,1,5410,733,6 -6258,convert_element_type_1649,call_function,convert_element_type.default,backward,6,1,1,1,5403,2,3 -6259,add_291,call_function,add.Tensor,unknown,,2,2,1,5411,732,10 -6260,dtype_cast_449,call_function,dtype_cast.default,backward,6,1,1,1,5404,1,3 -6261,alias_default_1309,call_function,alias.default,backward,6,1,1,0,5405,0,2 -6262,alias_default_1142,call_function,alias.default,unknown,,1,1,3,5412,731,4 -6263,einsum_default_499,call_function,einsum.default,backward,6,2,2,1,5413,3,5 -6264,permute_999,call_function,permute.default,backward,6,1,1,1,4,727,3 -6265,einsum_default_500,call_function,einsum.default,backward,6,2,2,1,5414,726,5 -6266,permute_1000,call_function,permute.default,backward,6,1,1,1,5414,2,4 -6267,dtype_cast_450,call_function,dtype_cast.default,backward,6,1,1,1,5415,1,4 -6268,alias_default_1304,call_function,alias.default,backward,6,1,1,0,5416,0,3 -6269,view_1118,call_function,view.default,backward,6,1,1,1,5415,725,4 -6270,permute_1001,call_function,permute.default,backward,6,1,1,1,5416,724,4 
-6271,_scaled_dot_product_flash_attention_backward_21,call_function,_scaled_dot_product_flash_attention_backward.default,backward,6,8,8,3,5420,723,2 -6272,getitem_315,call_function,getitem,backward,6,1,1,1,5421,696,2 -6273,getitem_316,call_function,getitem,backward,6,1,1,1,5421,697,2 -6274,getitem_317,call_function,getitem,backward,6,1,1,1,5421,690,2 -6275,permute_1002,call_function,permute.default,backward,6,1,1,1,5422,689,2 -6276,permute_1003,call_function,permute.default,backward,6,1,1,1,5422,696,2 -6277,permute_1004,call_function,permute.default,backward,6,1,1,1,5422,695,2 -6278,convert_element_type_1654,call_function,convert_element_type.default,backward,6,1,1,1,5423,695,2 -6279,convert_element_type_1655,call_function,convert_element_type.default,backward,6,1,1,1,5423,694,2 -6280,view_1119,call_function,view.default,backward,6,1,1,1,5424,694,2 -6281,view_as_complex_98,call_function,view_as_complex.default,backward,6,1,1,1,5425,693,6 -6282,_conj_42,call_function,_conj.default,backward,6,1,1,1,4,694,3 -6283,clone_174,call_function,clone.default,backward,6,1,1,1,5,693,3 -6284,mul_636,call_function,mul.Tensor,backward,6,2,2,1,5428,692,8 -6285,view_1120,call_function,view.default,backward,6,1,1,1,5424,693,2 -6286,view_as_complex_99,call_function,view_as_complex.default,backward,6,1,1,1,5425,692,6 -6287,_conj_43,call_function,_conj.default,backward,6,1,1,1,4,693,3 -6288,clone_175,call_function,clone.default,backward,6,1,1,1,5,692,3 -6289,mul_637,call_function,mul.Tensor,backward,6,2,2,1,5428,691,8 -6290,view_as_real_98,call_function,view_as_real.default,backward,6,1,1,1,5429,691,6 -6291,view_1121,call_function,view.default,backward,6,1,1,1,5430,690,6 -6292,convert_element_type_1656,call_function,convert_element_type.default,backward,6,1,1,1,5431,689,6 -6293,view_as_real_99,call_function,view_as_real.default,backward,6,1,1,1,5429,690,6 -6294,view_1122,call_function,view.default,backward,6,1,1,1,5430,689,6 
-6295,convert_element_type_1657,call_function,convert_element_type.default,backward,6,1,1,1,5431,688,6 -6296,view_1123,call_function,view.default,backward,6,1,1,1,5423,688,2 -6297,view_1124,call_function,view.default,backward,6,1,1,1,5432,688,5 -6298,view_1125,call_function,view.default,backward,6,1,1,1,5432,687,5 -6299,alias_default_1143,call_function,alias.default,backward,6,1,1,2,5424,687,4 -6300,einsum_default_501,call_function,einsum.default,backward,6,2,2,1,5425,3,5 -6301,permute_1007,call_function,permute.default,backward,6,1,1,1,4,683,3 -6302,einsum_default_502,call_function,einsum.default,backward,6,2,2,1,5426,682,5 -6303,permute_1008,call_function,permute.default,backward,6,1,1,1,5426,2,4 -6304,dtype_cast_451,call_function,dtype_cast.default,backward,6,1,1,1,5427,1,4 -6305,alias_default_1303,call_function,alias.default,backward,6,1,1,0,5428,0,3 -6306,alias_default_1144,call_function,alias.default,backward,6,1,1,2,5433,687,4 -6307,einsum_default_503,call_function,einsum.default,backward,6,2,2,1,5434,3,5 -6308,permute_1011,call_function,permute.default,backward,6,1,1,1,4,683,3 -6309,einsum_default_504,call_function,einsum.default,backward,6,2,2,1,5435,682,5 -6310,add_292,call_function,add.Tensor,unknown,,2,2,1,5442,681,10 -6311,permute_1012,call_function,permute.default,backward,6,1,1,1,5435,2,4 -6312,dtype_cast_452,call_function,dtype_cast.default,backward,6,1,1,1,5436,1,4 -6313,alias_default_1302,call_function,alias.default,backward,6,1,1,0,5437,0,3 -6314,alias_default_1145,call_function,alias.default,backward,6,1,1,2,5433,686,4 -6315,einsum_default_505,call_function,einsum.default,backward,6,2,2,1,5434,3,5 -6316,permute_1015,call_function,permute.default,backward,6,1,1,1,4,682,3 -6317,einsum_default_506,call_function,einsum.default,backward,6,2,2,1,5435,681,5 -6318,add_293,call_function,add.Tensor,unknown,,2,2,1,5458,680,10 -6319,permute_1016,call_function,permute.default,backward,6,1,1,1,5435,2,4 
-6320,dtype_cast_453,call_function,dtype_cast.default,backward,6,1,1,1,5436,1,4 -6321,alias_default_1301,call_function,alias.default,backward,6,1,1,0,5437,0,3 -6322,convert_element_type_1670,call_function,convert_element_type.default,backward,6,1,1,1,5459,679,8 -6323,convert_element_type_1671,call_function,convert_element_type.default,backward,6,1,1,1,669,679,4 -6324,convert_element_type_1672,call_function,convert_element_type.default,backward,6,1,1,1,3,673,2 -6325,alias_default_1146,call_function,alias.default,backward,6,1,1,2,5460,678,4 -6326,mul_638,call_function,mul.Tensor,backward,6,2,2,1,5462,672,8 -6327,mul_639,call_function,mul.Tensor,backward,6,2,2,1,677,678,8 -6328,alias_default_1147,call_function,alias.default,backward,6,1,1,2,5463,671,4 -6329,alias_default_1148,call_function,alias.default,backward,6,1,1,3,678,677,4 -6330,mul_640,call_function,mul.Tensor,backward,6,2,2,1,5467,670,8 -6331,sum_89,call_function,sum.dim_IntList,backward,6,1,1,1,5468,669,5 -6332,div_72,call_function,div.Tensor,backward,6,1,1,1,679,669,6 -6333,mul_641,call_function,mul.Tensor,backward,6,2,2,1,5470,668,8 -6334,sub_66,call_function,sub.Tensor,backward,6,2,2,1,5471,667,10 -6335,mul_642,call_function,mul.Tensor,backward,6,2,2,1,5472,666,8 -6336,mul_643,call_function,mul.Tensor,backward,6,2,2,1,5464,4,8 -6337,sum_90,call_function,sum.dim_IntList,backward,6,1,1,1,5465,3,5 -6338,convert_element_type_1673,call_function,convert_element_type.default,backward,6,1,1,1,5473,665,6 -6339,convert_element_type_1674,call_function,convert_element_type.default,backward,6,1,1,1,5466,2,3 -6340,add_294,call_function,add.Tensor,unknown,,2,2,1,5474,664,10 -6341,dtype_cast_454,call_function,dtype_cast.default,backward,6,1,1,1,5467,1,3 -6342,alias_default_1308,call_function,alias.default,backward,6,1,1,0,5468,0,2 -6343,alias_default_1149,call_function,alias.default,unknown,,1,1,3,5475,663,4 -6344,einsum_default_507,call_function,einsum.default,backward,5,2,2,1,5476,3,5 
-6345,permute_1019,call_function,permute.default,backward,5,1,1,1,4,659,3 -6346,einsum_default_508,call_function,einsum.default,backward,5,2,2,1,5477,658,5 -6347,permute_1020,call_function,permute.default,backward,5,1,1,1,5477,2,4 -6348,dtype_cast_455,call_function,dtype_cast.default,backward,5,1,1,1,5478,1,4 -6349,alias_default_1297,call_function,alias.default,backward,5,1,1,0,5479,0,3 -6350,alias_default_1150,call_function,alias.default,backward,5,1,1,2,5478,657,4 -6351,mul_644,call_function,mul.Tensor,backward,5,2,2,1,5479,645,8 -6352,mul_645,call_function,mul.Tensor,backward,5,2,2,1,5479,649,8 -6353,alias_default_1151,call_function,alias.default,backward,5,1,1,2,5480,644,4 -6354,einsum_default_509,call_function,einsum.default,backward,5,2,2,1,5481,3,5 -6355,permute_1023,call_function,permute.default,backward,5,1,1,1,4,640,3 -6356,einsum_default_510,call_function,einsum.default,backward,5,2,2,1,5482,639,5 -6357,permute_1024,call_function,permute.default,backward,5,1,1,1,5482,2,4 -6358,dtype_cast_456,call_function,dtype_cast.default,backward,5,1,1,1,5483,1,4 -6359,alias_default_1298,call_function,alias.default,backward,5,1,1,0,5484,0,3 -6360,convert_element_type_1683,call_function,convert_element_type.default,backward,5,1,1,1,5480,648,6 -6361,convert_element_type_1684,call_function,convert_element_type.default,backward,5,1,1,1,646,658,4 -6362,alias_default_1152,call_function,alias.default,backward,5,1,1,2,647,657,4 -6363,neg_50,call_function,neg.default,backward,5,1,1,1,648,656,8 -6364,exp_50,call_function,exp.default,backward,5,1,1,1,649,655,6 -6365,add_295,call_function,add.Tensor,backward,5,1,1,1,650,654,4 -6366,reciprocal_22,call_function,reciprocal.default,backward,5,1,1,1,651,653,4 -6367,mul_646,call_function,mul.Tensor,backward,5,1,1,1,652,652,6 -6368,alias_default_1153,call_function,alias.default,backward,5,1,1,2,653,651,4 -6369,mul_647,call_function,mul.Tensor,backward,5,2,2,1,5489,647,8 -6370,sub_67,call_function,sub.Tensor,backward,5,1,1,1,654,649,4 
-6371,mul_648,call_function,mul.Tensor,backward,5,2,2,1,655,648,8 -6372,add_296,call_function,add.Tensor,backward,5,1,1,1,656,647,4 -6373,mul_649,call_function,mul.Tensor,backward,5,2,2,1,5493,646,8 -6374,convert_element_type_1685,call_function,convert_element_type.default,backward,5,1,1,1,5494,645,6 -6375,alias_default_1154,call_function,alias.default,backward,5,1,1,2,5495,644,4 -6376,einsum_default_511,call_function,einsum.default,backward,5,2,2,1,5496,3,5 -6377,permute_1027,call_function,permute.default,backward,5,1,1,1,4,640,3 -6378,einsum_default_512,call_function,einsum.default,backward,5,2,2,1,5497,639,5 -6379,add_297,call_function,add.Tensor,unknown,,2,2,1,5502,638,10 -6380,permute_1028,call_function,permute.default,backward,5,1,1,1,5497,2,4 -6381,dtype_cast_457,call_function,dtype_cast.default,backward,5,1,1,1,5498,1,4 -6382,alias_default_1296,call_function,alias.default,backward,5,1,1,0,5499,0,3 -6383,convert_element_type_1690,call_function,convert_element_type.default,backward,5,1,1,1,5503,637,8 -6384,convert_element_type_1691,call_function,convert_element_type.default,backward,5,1,1,1,626,637,4 -6385,convert_element_type_1692,call_function,convert_element_type.default,backward,5,1,1,1,3,631,2 -6386,alias_default_1155,call_function,alias.default,backward,5,1,1,2,5504,636,4 -6387,mul_650,call_function,mul.Tensor,backward,5,2,2,1,5506,630,8 -6388,mul_651,call_function,mul.Tensor,backward,5,2,2,1,634,636,8 -6389,alias_default_1156,call_function,alias.default,backward,5,1,1,2,5507,629,4 -6390,alias_default_1157,call_function,alias.default,backward,5,1,1,3,635,635,4 -6391,mul_652,call_function,mul.Tensor,backward,5,2,2,1,5511,628,8 -6392,sum_91,call_function,sum.dim_IntList,backward,5,1,1,1,5512,627,5 -6393,div_73,call_function,div.Tensor,backward,5,1,1,1,636,627,6 -6394,mul_653,call_function,mul.Tensor,backward,5,2,2,1,5514,626,8 -6395,sub_68,call_function,sub.Tensor,backward,5,2,2,1,5515,625,10 
-6396,mul_654,call_function,mul.Tensor,backward,5,2,2,1,5516,624,8 -6397,mul_655,call_function,mul.Tensor,backward,5,2,2,1,5508,4,8 -6398,sum_92,call_function,sum.dim_IntList,backward,5,1,1,1,5509,3,5 -6399,convert_element_type_1693,call_function,convert_element_type.default,backward,5,1,1,1,5517,623,6 -6400,convert_element_type_1694,call_function,convert_element_type.default,backward,5,1,1,1,5510,2,3 -6401,add_298,call_function,add.Tensor,unknown,,2,2,1,5518,622,10 -6402,dtype_cast_458,call_function,dtype_cast.default,backward,5,1,1,1,5511,1,3 -6403,alias_default_1300,call_function,alias.default,backward,5,1,1,0,5512,0,2 -6404,alias_default_1158,call_function,alias.default,unknown,,1,1,3,5519,621,4 -6405,einsum_default_513,call_function,einsum.default,backward,5,2,2,1,5520,3,5 -6406,permute_1031,call_function,permute.default,backward,5,1,1,1,4,617,3 -6407,einsum_default_514,call_function,einsum.default,backward,5,2,2,1,5521,616,5 -6408,permute_1032,call_function,permute.default,backward,5,1,1,1,5521,2,4 -6409,dtype_cast_459,call_function,dtype_cast.default,backward,5,1,1,1,5522,1,4 -6410,alias_default_1295,call_function,alias.default,backward,5,1,1,0,5523,0,3 -6411,view_1140,call_function,view.default,backward,5,1,1,1,5522,615,4 -6412,permute_1033,call_function,permute.default,backward,5,1,1,1,5523,614,4 -6413,_scaled_dot_product_flash_attention_backward_22,call_function,_scaled_dot_product_flash_attention_backward.default,backward,5,8,8,3,5527,613,2 -6414,getitem_318,call_function,getitem,backward,5,1,1,1,5528,586,2 -6415,getitem_319,call_function,getitem,backward,5,1,1,1,5528,587,2 -6416,getitem_320,call_function,getitem,backward,5,1,1,1,5528,580,2 -6417,permute_1034,call_function,permute.default,backward,5,1,1,1,5529,579,2 -6418,permute_1035,call_function,permute.default,backward,5,1,1,1,5529,586,2 -6419,permute_1036,call_function,permute.default,backward,5,1,1,1,5529,585,2 
-6420,convert_element_type_1699,call_function,convert_element_type.default,backward,5,1,1,1,5530,585,2 -6421,convert_element_type_1700,call_function,convert_element_type.default,backward,5,1,1,1,5530,584,2 -6422,view_1141,call_function,view.default,backward,5,1,1,1,5531,584,2 -6423,view_as_complex_100,call_function,view_as_complex.default,backward,5,1,1,1,5532,583,6 -6424,_conj_44,call_function,_conj.default,backward,5,1,1,1,4,584,3 -6425,clone_182,call_function,clone.default,backward,5,1,1,1,5,583,3 -6426,mul_656,call_function,mul.Tensor,backward,5,2,2,1,5535,582,8 -6427,view_1142,call_function,view.default,backward,5,1,1,1,5531,583,2 -6428,view_as_complex_101,call_function,view_as_complex.default,backward,5,1,1,1,5532,582,6 -6429,_conj_45,call_function,_conj.default,backward,5,1,1,1,4,583,3 -6430,clone_183,call_function,clone.default,backward,5,1,1,1,5,582,3 -6431,mul_657,call_function,mul.Tensor,backward,5,2,2,1,5535,581,8 -6432,view_as_real_100,call_function,view_as_real.default,backward,5,1,1,1,5536,581,6 -6433,view_1143,call_function,view.default,backward,5,1,1,1,5537,580,6 -6434,convert_element_type_1701,call_function,convert_element_type.default,backward,5,1,1,1,5538,579,6 -6435,view_as_real_101,call_function,view_as_real.default,backward,5,1,1,1,5536,580,6 -6436,view_1144,call_function,view.default,backward,5,1,1,1,5537,579,6 -6437,convert_element_type_1702,call_function,convert_element_type.default,backward,5,1,1,1,5538,578,6 -6438,view_1145,call_function,view.default,backward,5,1,1,1,5530,578,2 -6439,view_1146,call_function,view.default,backward,5,1,1,1,5539,578,5 -6440,view_1147,call_function,view.default,backward,5,1,1,1,5539,577,5 -6441,alias_default_1159,call_function,alias.default,backward,5,1,1,2,5531,577,4 -6442,einsum_default_515,call_function,einsum.default,backward,5,2,2,1,5532,3,5 -6443,permute_1039,call_function,permute.default,backward,5,1,1,1,4,573,3 -6444,einsum_default_516,call_function,einsum.default,backward,5,2,2,1,5533,572,5 
-6445,permute_1040,call_function,permute.default,backward,5,1,1,1,5533,2,4 -6446,dtype_cast_460,call_function,dtype_cast.default,backward,5,1,1,1,5534,1,4 -6447,alias_default_1294,call_function,alias.default,backward,5,1,1,0,5535,0,3 -6448,alias_default_1160,call_function,alias.default,backward,5,1,1,2,5540,577,4 -6449,einsum_default_517,call_function,einsum.default,backward,5,2,2,1,5541,3,5 -6450,permute_1043,call_function,permute.default,backward,5,1,1,1,4,573,3 -6451,einsum_default_518,call_function,einsum.default,backward,5,2,2,1,5542,572,5 -6452,add_299,call_function,add.Tensor,unknown,,2,2,1,5549,571,10 -6453,permute_1044,call_function,permute.default,backward,5,1,1,1,5542,2,4 -6454,dtype_cast_461,call_function,dtype_cast.default,backward,5,1,1,1,5543,1,4 -6455,alias_default_1293,call_function,alias.default,backward,5,1,1,0,5544,0,3 -6456,alias_default_1161,call_function,alias.default,backward,5,1,1,2,5540,576,4 -6457,einsum_default_519,call_function,einsum.default,backward,5,2,2,1,5541,3,5 -6458,permute_1047,call_function,permute.default,backward,5,1,1,1,4,572,3 -6459,einsum_default_520,call_function,einsum.default,backward,5,2,2,1,5542,571,5 -6460,add_300,call_function,add.Tensor,unknown,,2,2,1,5565,570,10 -6461,permute_1048,call_function,permute.default,backward,5,1,1,1,5542,2,4 -6462,dtype_cast_462,call_function,dtype_cast.default,backward,5,1,1,1,5543,1,4 -6463,alias_default_1292,call_function,alias.default,backward,5,1,1,0,5544,0,3 -6464,convert_element_type_1715,call_function,convert_element_type.default,backward,5,1,1,1,5566,569,8 -6465,convert_element_type_1716,call_function,convert_element_type.default,backward,5,1,1,1,559,569,4 -6466,convert_element_type_1717,call_function,convert_element_type.default,backward,5,1,1,1,3,563,2 -6467,alias_default_1162,call_function,alias.default,backward,5,1,1,2,5567,568,4 -6468,mul_658,call_function,mul.Tensor,backward,5,2,2,1,5569,562,8 -6469,mul_659,call_function,mul.Tensor,backward,5,2,2,1,567,568,8 
-6470,alias_default_1163,call_function,alias.default,backward,5,1,1,2,5570,561,4 -6471,alias_default_1164,call_function,alias.default,backward,5,1,1,3,568,567,4 -6472,mul_660,call_function,mul.Tensor,backward,5,2,2,1,5574,560,8 -6473,sum_93,call_function,sum.dim_IntList,backward,5,1,1,1,5575,559,5 -6474,div_74,call_function,div.Tensor,backward,5,1,1,1,569,559,6 -6475,mul_661,call_function,mul.Tensor,backward,5,2,2,1,5577,558,8 -6476,sub_69,call_function,sub.Tensor,backward,5,2,2,1,5578,557,10 -6477,mul_662,call_function,mul.Tensor,backward,5,2,2,1,5579,556,8 -6478,mul_663,call_function,mul.Tensor,backward,5,2,2,1,5571,4,8 -6479,sum_94,call_function,sum.dim_IntList,backward,5,1,1,1,5572,3,5 -6480,convert_element_type_1718,call_function,convert_element_type.default,backward,5,1,1,1,5580,555,6 -6481,convert_element_type_1719,call_function,convert_element_type.default,backward,5,1,1,1,5573,2,3 -6482,add_301,call_function,add.Tensor,unknown,,2,2,1,5581,554,10 -6483,dtype_cast_463,call_function,dtype_cast.default,backward,5,1,1,1,5574,1,3 -6484,alias_default_1299,call_function,alias.default,backward,5,1,1,0,5575,0,2 -6485,alias_default_1165,call_function,alias.default,unknown,,1,1,3,5582,553,4 -6486,einsum_default_521,call_function,einsum.default,backward,4,2,2,1,5583,3,5 -6487,permute_1051,call_function,permute.default,backward,4,1,1,1,4,549,3 -6488,einsum_default_522,call_function,einsum.default,backward,4,2,2,1,5584,548,5 -6489,permute_1052,call_function,permute.default,backward,4,1,1,1,5584,2,4 -6490,dtype_cast_464,call_function,dtype_cast.default,backward,4,1,1,1,5585,1,4 -6491,alias_default_1288,call_function,alias.default,backward,4,1,1,0,5586,0,3 -6492,alias_default_1166,call_function,alias.default,backward,4,1,1,2,5585,547,4 -6493,mul_664,call_function,mul.Tensor,backward,4,2,2,1,5586,535,8 -6494,mul_665,call_function,mul.Tensor,backward,4,2,2,1,5586,539,8 -6495,alias_default_1167,call_function,alias.default,backward,4,1,1,2,5587,534,4 
-6496,einsum_default_523,call_function,einsum.default,backward,4,2,2,1,5588,3,5 -6497,permute_1055,call_function,permute.default,backward,4,1,1,1,4,530,3 -6498,einsum_default_524,call_function,einsum.default,backward,4,2,2,1,5589,529,5 -6499,permute_1056,call_function,permute.default,backward,4,1,1,1,5589,2,4 -6500,dtype_cast_465,call_function,dtype_cast.default,backward,4,1,1,1,5590,1,4 -6501,alias_default_1289,call_function,alias.default,backward,4,1,1,0,5591,0,3 -6502,convert_element_type_1728,call_function,convert_element_type.default,backward,4,1,1,1,5587,538,6 -6503,convert_element_type_1729,call_function,convert_element_type.default,backward,4,1,1,1,536,548,4 -6504,alias_default_1168,call_function,alias.default,backward,4,1,1,2,537,547,4 -6505,neg_51,call_function,neg.default,backward,4,1,1,1,538,546,8 -6506,exp_51,call_function,exp.default,backward,4,1,1,1,539,545,6 -6507,add_302,call_function,add.Tensor,backward,4,1,1,1,540,544,4 -6508,reciprocal_23,call_function,reciprocal.default,backward,4,1,1,1,541,543,4 -6509,mul_666,call_function,mul.Tensor,backward,4,1,1,1,542,542,6 -6510,alias_default_1169,call_function,alias.default,backward,4,1,1,2,543,541,4 -6511,mul_667,call_function,mul.Tensor,backward,4,2,2,1,5596,537,8 -6512,sub_70,call_function,sub.Tensor,backward,4,1,1,1,544,539,4 -6513,mul_668,call_function,mul.Tensor,backward,4,2,2,1,545,538,8 -6514,add_303,call_function,add.Tensor,backward,4,1,1,1,546,537,4 -6515,mul_669,call_function,mul.Tensor,backward,4,2,2,1,5600,536,8 -6516,convert_element_type_1730,call_function,convert_element_type.default,backward,4,1,1,1,5601,535,6 -6517,alias_default_1170,call_function,alias.default,backward,4,1,1,2,5602,534,4 -6518,einsum_default_525,call_function,einsum.default,backward,4,2,2,1,5603,3,5 -6519,permute_1059,call_function,permute.default,backward,4,1,1,1,4,530,3 -6520,einsum_default_526,call_function,einsum.default,backward,4,2,2,1,5604,529,5 -6521,add_304,call_function,add.Tensor,unknown,,2,2,1,5609,528,10 
-6522,permute_1060,call_function,permute.default,backward,4,1,1,1,5604,2,4 -6523,dtype_cast_466,call_function,dtype_cast.default,backward,4,1,1,1,5605,1,4 -6524,alias_default_1287,call_function,alias.default,backward,4,1,1,0,5606,0,3 -6525,convert_element_type_1735,call_function,convert_element_type.default,backward,4,1,1,1,5610,527,8 -6526,convert_element_type_1736,call_function,convert_element_type.default,backward,4,1,1,1,516,527,4 -6527,convert_element_type_1737,call_function,convert_element_type.default,backward,4,1,1,1,3,521,2 -6528,alias_default_1171,call_function,alias.default,backward,4,1,1,2,5611,526,4 -6529,mul_670,call_function,mul.Tensor,backward,4,2,2,1,5613,520,8 -6530,mul_671,call_function,mul.Tensor,backward,4,2,2,1,524,526,8 -6531,alias_default_1172,call_function,alias.default,backward,4,1,1,2,5614,519,4 -6532,alias_default_1173,call_function,alias.default,backward,4,1,1,3,525,525,4 -6533,mul_672,call_function,mul.Tensor,backward,4,2,2,1,5618,518,8 -6534,sum_95,call_function,sum.dim_IntList,backward,4,1,1,1,5619,517,5 -6535,div_75,call_function,div.Tensor,backward,4,1,1,1,526,517,6 -6536,mul_673,call_function,mul.Tensor,backward,4,2,2,1,5621,516,8 -6537,sub_71,call_function,sub.Tensor,backward,4,2,2,1,5622,515,10 -6538,mul_674,call_function,mul.Tensor,backward,4,2,2,1,5623,514,8 -6539,mul_675,call_function,mul.Tensor,backward,4,2,2,1,5615,4,8 -6540,sum_96,call_function,sum.dim_IntList,backward,4,1,1,1,5616,3,5 -6541,convert_element_type_1738,call_function,convert_element_type.default,backward,4,1,1,1,5624,513,6 -6542,convert_element_type_1739,call_function,convert_element_type.default,backward,4,1,1,1,5617,2,3 -6543,add_305,call_function,add.Tensor,unknown,,2,2,1,5625,512,10 -6544,dtype_cast_467,call_function,dtype_cast.default,backward,4,1,1,1,5618,1,3 -6545,alias_default_1291,call_function,alias.default,backward,4,1,1,0,5619,0,2 -6546,alias_default_1174,call_function,alias.default,unknown,,1,1,3,5626,511,4 
-6547,einsum_default_527,call_function,einsum.default,backward,4,2,2,1,5627,3,5 -6548,permute_1063,call_function,permute.default,backward,4,1,1,1,4,507,3 -6549,einsum_default_528,call_function,einsum.default,backward,4,2,2,1,5628,506,5 -6550,permute_1064,call_function,permute.default,backward,4,1,1,1,5628,2,4 -6551,dtype_cast_468,call_function,dtype_cast.default,backward,4,1,1,1,5629,1,4 -6552,alias_default_1286,call_function,alias.default,backward,4,1,1,0,5630,0,3 -6553,view_1162,call_function,view.default,backward,4,1,1,1,5629,505,4 -6554,permute_1065,call_function,permute.default,backward,4,1,1,1,5630,504,4 -6555,_scaled_dot_product_flash_attention_backward_23,call_function,_scaled_dot_product_flash_attention_backward.default,backward,4,8,8,3,5634,503,2 -6556,getitem_321,call_function,getitem,backward,4,1,1,1,5635,476,2 -6557,getitem_322,call_function,getitem,backward,4,1,1,1,5635,477,2 -6558,getitem_323,call_function,getitem,backward,4,1,1,1,5635,470,2 -6559,permute_1066,call_function,permute.default,backward,4,1,1,1,5636,469,2 -6560,permute_1067,call_function,permute.default,backward,4,1,1,1,5636,476,2 -6561,permute_1068,call_function,permute.default,backward,4,1,1,1,5636,475,2 -6562,convert_element_type_1744,call_function,convert_element_type.default,backward,4,1,1,1,5637,475,2 -6563,convert_element_type_1745,call_function,convert_element_type.default,backward,4,1,1,1,5637,474,2 -6564,view_1163,call_function,view.default,backward,4,1,1,1,5638,474,2 -6565,view_as_complex_102,call_function,view_as_complex.default,backward,4,1,1,1,5639,473,6 -6566,_conj_46,call_function,_conj.default,backward,4,1,1,1,4,474,3 -6567,clone_190,call_function,clone.default,backward,4,1,1,1,5,473,3 -6568,mul_676,call_function,mul.Tensor,backward,4,2,2,1,5642,472,8 -6569,view_1164,call_function,view.default,backward,4,1,1,1,5638,473,2 -6570,view_as_complex_103,call_function,view_as_complex.default,backward,4,1,1,1,5639,472,6 
-6571,_conj_47,call_function,_conj.default,backward,4,1,1,1,4,473,3 -6572,clone_191,call_function,clone.default,backward,4,1,1,1,5,472,3 -6573,mul_677,call_function,mul.Tensor,backward,4,2,2,1,5642,471,8 -6574,view_as_real_102,call_function,view_as_real.default,backward,4,1,1,1,5643,471,6 -6575,view_1165,call_function,view.default,backward,4,1,1,1,5644,470,6 -6576,convert_element_type_1746,call_function,convert_element_type.default,backward,4,1,1,1,5645,469,6 -6577,view_as_real_103,call_function,view_as_real.default,backward,4,1,1,1,5643,470,6 -6578,view_1166,call_function,view.default,backward,4,1,1,1,5644,469,6 -6579,convert_element_type_1747,call_function,convert_element_type.default,backward,4,1,1,1,5645,468,6 -6580,view_1167,call_function,view.default,backward,4,1,1,1,5637,468,2 -6581,view_1168,call_function,view.default,backward,4,1,1,1,5646,468,5 -6582,view_1169,call_function,view.default,backward,4,1,1,1,5646,467,5 -6583,alias_default_1175,call_function,alias.default,backward,4,1,1,2,5638,467,4 -6584,einsum_default_529,call_function,einsum.default,backward,4,2,2,1,5639,3,5 -6585,permute_1071,call_function,permute.default,backward,4,1,1,1,4,463,3 -6586,einsum_default_530,call_function,einsum.default,backward,4,2,2,1,5640,462,5 -6587,permute_1072,call_function,permute.default,backward,4,1,1,1,5640,2,4 -6588,dtype_cast_469,call_function,dtype_cast.default,backward,4,1,1,1,5641,1,4 -6589,alias_default_1285,call_function,alias.default,backward,4,1,1,0,5642,0,3 -6590,alias_default_1176,call_function,alias.default,backward,4,1,1,2,5647,467,4 -6591,einsum_default_531,call_function,einsum.default,backward,4,2,2,1,5648,3,5 -6592,permute_1075,call_function,permute.default,backward,4,1,1,1,4,463,3 -6593,einsum_default_532,call_function,einsum.default,backward,4,2,2,1,5649,462,5 -6594,add_306,call_function,add.Tensor,unknown,,2,2,1,5656,461,10 -6595,permute_1076,call_function,permute.default,backward,4,1,1,1,5649,2,4 
-6596,dtype_cast_470,call_function,dtype_cast.default,backward,4,1,1,1,5650,1,4 -6597,alias_default_1284,call_function,alias.default,backward,4,1,1,0,5651,0,3 -6598,alias_default_1177,call_function,alias.default,backward,4,1,1,2,5647,466,4 -6599,einsum_default_533,call_function,einsum.default,backward,4,2,2,1,5648,3,5 -6600,permute_1079,call_function,permute.default,backward,4,1,1,1,4,462,3 -6601,einsum_default_534,call_function,einsum.default,backward,4,2,2,1,5649,461,5 -6602,add_307,call_function,add.Tensor,unknown,,2,2,1,5672,460,10 -6603,permute_1080,call_function,permute.default,backward,4,1,1,1,5649,2,4 -6604,dtype_cast_471,call_function,dtype_cast.default,backward,4,1,1,1,5650,1,4 -6605,alias_default_1283,call_function,alias.default,backward,4,1,1,0,5651,0,3 -6606,convert_element_type_1760,call_function,convert_element_type.default,backward,4,1,1,1,5673,459,8 -6607,convert_element_type_1761,call_function,convert_element_type.default,backward,4,1,1,1,449,459,4 -6608,convert_element_type_1762,call_function,convert_element_type.default,backward,4,1,1,1,3,453,2 -6609,alias_default_1178,call_function,alias.default,backward,4,1,1,2,5674,458,4 -6610,mul_678,call_function,mul.Tensor,backward,4,2,2,1,5676,452,8 -6611,mul_679,call_function,mul.Tensor,backward,4,2,2,1,457,458,8 -6612,alias_default_1179,call_function,alias.default,backward,4,1,1,2,5677,451,4 -6613,alias_default_1180,call_function,alias.default,backward,4,1,1,3,458,457,4 -6614,mul_680,call_function,mul.Tensor,backward,4,2,2,1,5681,450,8 -6615,sum_97,call_function,sum.dim_IntList,backward,4,1,1,1,5682,449,5 -6616,div_76,call_function,div.Tensor,backward,4,1,1,1,459,449,6 -6617,mul_681,call_function,mul.Tensor,backward,4,2,2,1,5684,448,8 -6618,sub_72,call_function,sub.Tensor,backward,4,2,2,1,5685,447,10 -6619,mul_682,call_function,mul.Tensor,backward,4,2,2,1,5686,446,8 -6620,mul_683,call_function,mul.Tensor,backward,4,2,2,1,5678,4,8 -6621,sum_98,call_function,sum.dim_IntList,backward,4,1,1,1,5679,3,5 
-6622,convert_element_type_1763,call_function,convert_element_type.default,backward,4,1,1,1,5687,445,6 -6623,convert_element_type_1764,call_function,convert_element_type.default,backward,4,1,1,1,5680,2,3 -6624,add_308,call_function,add.Tensor,unknown,,2,2,1,5688,444,10 -6625,dtype_cast_472,call_function,dtype_cast.default,backward,4,1,1,1,5681,1,3 -6626,alias_default_1290,call_function,alias.default,backward,4,1,1,0,5682,0,2 -6627,alias_default_1181,call_function,alias.default,unknown,,1,1,3,5689,443,4 -6628,einsum_default_535,call_function,einsum.default,backward,3,2,2,1,5690,3,5 -6629,permute_1083,call_function,permute.default,backward,3,1,1,1,4,439,3 -6630,einsum_default_536,call_function,einsum.default,backward,3,2,2,1,5691,438,5 -6631,permute_1084,call_function,permute.default,backward,3,1,1,1,5691,2,4 -6632,dtype_cast_473,call_function,dtype_cast.default,backward,3,1,1,1,5692,1,4 -6633,alias_default_1279,call_function,alias.default,backward,3,1,1,0,5693,0,3 -6634,alias_default_1182,call_function,alias.default,backward,3,1,1,2,5692,437,4 -6635,mul_684,call_function,mul.Tensor,backward,3,2,2,1,5693,425,8 -6636,mul_685,call_function,mul.Tensor,backward,3,2,2,1,5693,429,8 -6637,alias_default_1183,call_function,alias.default,backward,3,1,1,2,5694,424,4 -6638,einsum_default_537,call_function,einsum.default,backward,3,2,2,1,5695,3,5 -6639,permute_1087,call_function,permute.default,backward,3,1,1,1,4,420,3 -6640,einsum_default_538,call_function,einsum.default,backward,3,2,2,1,5696,419,5 -6641,permute_1088,call_function,permute.default,backward,3,1,1,1,5696,2,4 -6642,dtype_cast_474,call_function,dtype_cast.default,backward,3,1,1,1,5697,1,4 -6643,alias_default_1280,call_function,alias.default,backward,3,1,1,0,5698,0,3 -6644,convert_element_type_1773,call_function,convert_element_type.default,backward,3,1,1,1,5694,428,6 -6645,convert_element_type_1774,call_function,convert_element_type.default,backward,3,1,1,1,426,438,4 
-6646,alias_default_1184,call_function,alias.default,backward,3,1,1,2,427,437,4 -6647,neg_52,call_function,neg.default,backward,3,1,1,1,428,436,8 -6648,exp_52,call_function,exp.default,backward,3,1,1,1,429,435,6 -6649,add_309,call_function,add.Tensor,backward,3,1,1,1,430,434,4 -6650,reciprocal_24,call_function,reciprocal.default,backward,3,1,1,1,431,433,4 -6651,mul_686,call_function,mul.Tensor,backward,3,1,1,1,432,432,6 -6652,alias_default_1185,call_function,alias.default,backward,3,1,1,2,433,431,4 -6653,mul_687,call_function,mul.Tensor,backward,3,2,2,1,5703,427,8 -6654,sub_73,call_function,sub.Tensor,backward,3,1,1,1,434,429,4 -6655,mul_688,call_function,mul.Tensor,backward,3,2,2,1,435,428,8 -6656,add_310,call_function,add.Tensor,backward,3,1,1,1,436,427,4 -6657,mul_689,call_function,mul.Tensor,backward,3,2,2,1,5707,426,8 -6658,convert_element_type_1775,call_function,convert_element_type.default,backward,3,1,1,1,5708,425,6 -6659,alias_default_1186,call_function,alias.default,backward,3,1,1,2,5709,424,4 -6660,einsum_default_539,call_function,einsum.default,backward,3,2,2,1,5710,3,5 -6661,permute_1091,call_function,permute.default,backward,3,1,1,1,4,420,3 -6662,einsum_default_540,call_function,einsum.default,backward,3,2,2,1,5711,419,5 -6663,add_311,call_function,add.Tensor,unknown,,2,2,1,5716,418,10 -6664,permute_1092,call_function,permute.default,backward,3,1,1,1,5711,2,4 -6665,dtype_cast_475,call_function,dtype_cast.default,backward,3,1,1,1,5712,1,4 -6666,alias_default_1278,call_function,alias.default,backward,3,1,1,0,5713,0,3 -6667,convert_element_type_1780,call_function,convert_element_type.default,backward,3,1,1,1,5717,417,8 -6668,convert_element_type_1781,call_function,convert_element_type.default,backward,3,1,1,1,406,417,4 -6669,convert_element_type_1782,call_function,convert_element_type.default,backward,3,1,1,1,3,411,2 -6670,alias_default_1187,call_function,alias.default,backward,3,1,1,2,5718,416,4 
-6671,mul_690,call_function,mul.Tensor,backward,3,2,2,1,5720,410,8 -6672,mul_691,call_function,mul.Tensor,backward,3,2,2,1,414,416,8 -6673,alias_default_1188,call_function,alias.default,backward,3,1,1,2,5721,409,4 -6674,alias_default_1189,call_function,alias.default,backward,3,1,1,3,415,415,4 -6675,mul_692,call_function,mul.Tensor,backward,3,2,2,1,5725,408,8 -6676,sum_99,call_function,sum.dim_IntList,backward,3,1,1,1,5726,407,5 -6677,div_77,call_function,div.Tensor,backward,3,1,1,1,416,407,6 -6678,mul_693,call_function,mul.Tensor,backward,3,2,2,1,5728,406,8 -6679,sub_74,call_function,sub.Tensor,backward,3,2,2,1,5729,405,10 -6680,mul_694,call_function,mul.Tensor,backward,3,2,2,1,5730,404,8 -6681,mul_695,call_function,mul.Tensor,backward,3,2,2,1,5722,4,8 -6682,sum_100,call_function,sum.dim_IntList,backward,3,1,1,1,5723,3,5 -6683,convert_element_type_1783,call_function,convert_element_type.default,backward,3,1,1,1,5731,403,6 -6684,convert_element_type_1784,call_function,convert_element_type.default,backward,3,1,1,1,5724,2,3 -6685,add_312,call_function,add.Tensor,unknown,,2,2,1,5732,402,10 -6686,dtype_cast_476,call_function,dtype_cast.default,backward,3,1,1,1,5725,1,3 -6687,alias_default_1282,call_function,alias.default,backward,3,1,1,0,5726,0,2 -6688,alias_default_1190,call_function,alias.default,unknown,,1,1,3,5733,401,4 -6689,einsum_default_541,call_function,einsum.default,backward,3,2,2,1,5734,3,5 -6690,permute_1095,call_function,permute.default,backward,3,1,1,1,4,397,3 -6691,einsum_default_542,call_function,einsum.default,backward,3,2,2,1,5735,396,5 -6692,permute_1096,call_function,permute.default,backward,3,1,1,1,5735,2,4 -6693,dtype_cast_477,call_function,dtype_cast.default,backward,3,1,1,1,5736,1,4 -6694,alias_default_1277,call_function,alias.default,backward,3,1,1,0,5737,0,3 -6695,view_1184,call_function,view.default,backward,3,1,1,1,5736,395,4 -6696,permute_1097,call_function,permute.default,backward,3,1,1,1,5737,394,4 
-6697,_scaled_dot_product_flash_attention_backward_24,call_function,_scaled_dot_product_flash_attention_backward.default,backward,3,8,8,3,5741,393,2 -6698,getitem_324,call_function,getitem,backward,3,1,1,1,5742,366,2 -6699,getitem_325,call_function,getitem,backward,3,1,1,1,5742,367,2 -6700,getitem_326,call_function,getitem,backward,3,1,1,1,5742,360,2 -6701,permute_1098,call_function,permute.default,backward,3,1,1,1,5743,359,2 -6702,permute_1099,call_function,permute.default,backward,3,1,1,1,5743,366,2 -6703,permute_1100,call_function,permute.default,backward,3,1,1,1,5743,365,2 -6704,convert_element_type_1789,call_function,convert_element_type.default,backward,3,1,1,1,5744,365,2 -6705,convert_element_type_1790,call_function,convert_element_type.default,backward,3,1,1,1,5744,364,2 -6706,view_1185,call_function,view.default,backward,3,1,1,1,5745,364,2 -6707,view_as_complex_104,call_function,view_as_complex.default,backward,3,1,1,1,5746,363,6 -6708,_conj_48,call_function,_conj.default,backward,3,1,1,1,4,364,3 -6709,clone_198,call_function,clone.default,backward,3,1,1,1,5,363,3 -6710,mul_696,call_function,mul.Tensor,backward,3,2,2,1,5749,362,8 -6711,view_1186,call_function,view.default,backward,3,1,1,1,5745,363,2 -6712,view_as_complex_105,call_function,view_as_complex.default,backward,3,1,1,1,5746,362,6 -6713,_conj_49,call_function,_conj.default,backward,3,1,1,1,4,363,3 -6714,clone_199,call_function,clone.default,backward,3,1,1,1,5,362,3 -6715,mul_697,call_function,mul.Tensor,backward,3,2,2,1,5749,361,8 -6716,view_as_real_104,call_function,view_as_real.default,backward,3,1,1,1,5750,361,6 -6717,view_1187,call_function,view.default,backward,3,1,1,1,5751,360,6 -6718,convert_element_type_1791,call_function,convert_element_type.default,backward,3,1,1,1,5752,359,6 -6719,view_as_real_105,call_function,view_as_real.default,backward,3,1,1,1,5750,360,6 -6720,view_1188,call_function,view.default,backward,3,1,1,1,5751,359,6 
-6721,convert_element_type_1792,call_function,convert_element_type.default,backward,3,1,1,1,5752,358,6 -6722,view_1189,call_function,view.default,backward,3,1,1,1,5744,358,2 -6723,view_1190,call_function,view.default,backward,3,1,1,1,5753,358,5 -6724,view_1191,call_function,view.default,backward,3,1,1,1,5753,357,5 -6725,alias_default_1191,call_function,alias.default,backward,3,1,1,2,5745,357,4 -6726,einsum_default_543,call_function,einsum.default,backward,3,2,2,1,5746,3,5 -6727,permute_1103,call_function,permute.default,backward,3,1,1,1,4,353,3 -6728,einsum_default_544,call_function,einsum.default,backward,3,2,2,1,5747,352,5 -6729,permute_1104,call_function,permute.default,backward,3,1,1,1,5747,2,4 -6730,dtype_cast_478,call_function,dtype_cast.default,backward,3,1,1,1,5748,1,4 -6731,alias_default_1276,call_function,alias.default,backward,3,1,1,0,5749,0,3 -6732,alias_default_1192,call_function,alias.default,backward,3,1,1,2,5754,357,4 -6733,einsum_default_545,call_function,einsum.default,backward,3,2,2,1,5755,3,5 -6734,permute_1107,call_function,permute.default,backward,3,1,1,1,4,353,3 -6735,einsum_default_546,call_function,einsum.default,backward,3,2,2,1,5756,352,5 -6736,add_313,call_function,add.Tensor,unknown,,2,2,1,5763,351,10 -6737,permute_1108,call_function,permute.default,backward,3,1,1,1,5756,2,4 -6738,dtype_cast_479,call_function,dtype_cast.default,backward,3,1,1,1,5757,1,4 -6739,alias_default_1275,call_function,alias.default,backward,3,1,1,0,5758,0,3 -6740,alias_default_1193,call_function,alias.default,backward,3,1,1,2,5754,356,4 -6741,einsum_default_547,call_function,einsum.default,backward,3,2,2,1,5755,3,5 -6742,permute_1111,call_function,permute.default,backward,3,1,1,1,4,352,3 -6743,einsum_default_548,call_function,einsum.default,backward,3,2,2,1,5756,351,5 -6744,add_314,call_function,add.Tensor,unknown,,2,2,1,5779,350,10 -6745,permute_1112,call_function,permute.default,backward,3,1,1,1,5756,2,4 
-6746,dtype_cast_480,call_function,dtype_cast.default,backward,3,1,1,1,5757,1,4 -6747,alias_default_1274,call_function,alias.default,backward,3,1,1,0,5758,0,3 -6748,convert_element_type_1805,call_function,convert_element_type.default,backward,3,1,1,1,5780,349,8 -6749,convert_element_type_1806,call_function,convert_element_type.default,backward,3,1,1,1,339,349,4 -6750,convert_element_type_1807,call_function,convert_element_type.default,backward,3,1,1,1,3,343,2 -6751,alias_default_1194,call_function,alias.default,backward,3,1,1,2,5781,348,4 -6752,mul_698,call_function,mul.Tensor,backward,3,2,2,1,5783,342,8 -6753,mul_699,call_function,mul.Tensor,backward,3,2,2,1,347,348,8 -6754,alias_default_1195,call_function,alias.default,backward,3,1,1,2,5784,341,4 -6755,alias_default_1196,call_function,alias.default,backward,3,1,1,3,348,347,4 -6756,mul_700,call_function,mul.Tensor,backward,3,2,2,1,5788,340,8 -6757,sum_101,call_function,sum.dim_IntList,backward,3,1,1,1,5789,339,5 -6758,div_78,call_function,div.Tensor,backward,3,1,1,1,349,339,6 -6759,mul_701,call_function,mul.Tensor,backward,3,2,2,1,5791,338,8 -6760,sub_75,call_function,sub.Tensor,backward,3,2,2,1,5792,337,10 -6761,mul_702,call_function,mul.Tensor,backward,3,2,2,1,5793,336,8 -6762,mul_703,call_function,mul.Tensor,backward,3,2,2,1,5785,4,8 -6763,sum_102,call_function,sum.dim_IntList,backward,3,1,1,1,5786,3,5 -6764,convert_element_type_1808,call_function,convert_element_type.default,backward,3,1,1,1,5794,335,6 -6765,convert_element_type_1809,call_function,convert_element_type.default,backward,3,1,1,1,5787,2,3 -6766,add_315,call_function,add.Tensor,unknown,,2,2,1,5795,334,10 -6767,dtype_cast_481,call_function,dtype_cast.default,backward,3,1,1,1,5788,1,3 -6768,alias_default_1281,call_function,alias.default,backward,3,1,1,0,5789,0,2 -6769,alias_default_1197,call_function,alias.default,unknown,,1,1,3,5796,333,4 -6770,einsum_default_549,call_function,einsum.default,backward,2,2,2,1,5797,3,5 
-6771,permute_1115,call_function,permute.default,backward,2,1,1,1,4,329,3 -6772,einsum_default_550,call_function,einsum.default,backward,2,2,2,1,5798,328,5 -6773,permute_1116,call_function,permute.default,backward,2,1,1,1,5798,2,4 -6774,dtype_cast_482,call_function,dtype_cast.default,backward,2,1,1,1,5799,1,4 -6775,alias_default_1270,call_function,alias.default,backward,2,1,1,0,5800,0,3 -6776,alias_default_1198,call_function,alias.default,backward,2,1,1,2,5799,327,4 -6777,mul_704,call_function,mul.Tensor,backward,2,2,2,1,5800,315,8 -6778,mul_705,call_function,mul.Tensor,backward,2,2,2,1,5800,319,8 -6779,alias_default_1199,call_function,alias.default,backward,2,1,1,2,5801,314,4 -6780,einsum_default_551,call_function,einsum.default,backward,2,2,2,1,5802,3,5 -6781,permute_1119,call_function,permute.default,backward,2,1,1,1,4,310,3 -6782,einsum_default_552,call_function,einsum.default,backward,2,2,2,1,5803,309,5 -6783,permute_1120,call_function,permute.default,backward,2,1,1,1,5803,2,4 -6784,dtype_cast_483,call_function,dtype_cast.default,backward,2,1,1,1,5804,1,4 -6785,alias_default_1271,call_function,alias.default,backward,2,1,1,0,5805,0,3 -6786,convert_element_type_1818,call_function,convert_element_type.default,backward,2,1,1,1,5801,318,6 -6787,convert_element_type_1819,call_function,convert_element_type.default,backward,2,1,1,1,316,328,4 -6788,alias_default_1200,call_function,alias.default,backward,2,1,1,2,317,327,4 -6789,neg_53,call_function,neg.default,backward,2,1,1,1,318,326,8 -6790,exp_53,call_function,exp.default,backward,2,1,1,1,319,325,6 -6791,add_316,call_function,add.Tensor,backward,2,1,1,1,320,324,4 -6792,reciprocal_25,call_function,reciprocal.default,backward,2,1,1,1,321,323,4 -6793,mul_706,call_function,mul.Tensor,backward,2,1,1,1,322,322,6 -6794,alias_default_1201,call_function,alias.default,backward,2,1,1,2,323,321,4 -6795,mul_707,call_function,mul.Tensor,backward,2,2,2,1,5810,317,8 -6796,sub_76,call_function,sub.Tensor,backward,2,1,1,1,324,319,4 
-6797,mul_708,call_function,mul.Tensor,backward,2,2,2,1,325,318,8 -6798,add_317,call_function,add.Tensor,backward,2,1,1,1,326,317,4 -6799,mul_709,call_function,mul.Tensor,backward,2,2,2,1,5814,316,8 -6800,convert_element_type_1820,call_function,convert_element_type.default,backward,2,1,1,1,5815,315,6 -6801,alias_default_1202,call_function,alias.default,backward,2,1,1,2,5816,314,4 -6802,einsum_default_553,call_function,einsum.default,backward,2,2,2,1,5817,3,5 -6803,permute_1123,call_function,permute.default,backward,2,1,1,1,4,310,3 -6804,einsum_default_554,call_function,einsum.default,backward,2,2,2,1,5818,309,5 -6805,add_318,call_function,add.Tensor,unknown,,2,2,1,5823,308,10 -6806,permute_1124,call_function,permute.default,backward,2,1,1,1,5818,2,4 -6807,dtype_cast_484,call_function,dtype_cast.default,backward,2,1,1,1,5819,1,4 -6808,alias_default_1269,call_function,alias.default,backward,2,1,1,0,5820,0,3 -6809,convert_element_type_1825,call_function,convert_element_type.default,backward,2,1,1,1,5824,307,8 -6810,convert_element_type_1826,call_function,convert_element_type.default,backward,2,1,1,1,296,307,4 -6811,convert_element_type_1827,call_function,convert_element_type.default,backward,2,1,1,1,3,301,2 -6812,alias_default_1203,call_function,alias.default,backward,2,1,1,2,5825,306,4 -6813,mul_710,call_function,mul.Tensor,backward,2,2,2,1,5827,300,8 -6814,mul_711,call_function,mul.Tensor,backward,2,2,2,1,304,306,8 -6815,alias_default_1204,call_function,alias.default,backward,2,1,1,2,5828,299,4 -6816,alias_default_1205,call_function,alias.default,backward,2,1,1,3,305,305,4 -6817,mul_712,call_function,mul.Tensor,backward,2,2,2,1,5832,298,8 -6818,sum_103,call_function,sum.dim_IntList,backward,2,1,1,1,5833,297,5 -6819,div_79,call_function,div.Tensor,backward,2,1,1,1,306,297,6 -6820,mul_713,call_function,mul.Tensor,backward,2,2,2,1,5835,296,8 -6821,sub_77,call_function,sub.Tensor,backward,2,2,2,1,5836,295,10 
-6822,mul_714,call_function,mul.Tensor,backward,2,2,2,1,5837,294,8 -6823,mul_715,call_function,mul.Tensor,backward,2,2,2,1,5829,4,8 -6824,sum_104,call_function,sum.dim_IntList,backward,2,1,1,1,5830,3,5 -6825,convert_element_type_1828,call_function,convert_element_type.default,backward,2,1,1,1,5838,293,6 -6826,convert_element_type_1829,call_function,convert_element_type.default,backward,2,1,1,1,5831,2,3 -6827,add_319,call_function,add.Tensor,unknown,,2,2,1,5839,292,10 -6828,dtype_cast_485,call_function,dtype_cast.default,backward,2,1,1,1,5832,1,3 -6829,alias_default_1273,call_function,alias.default,backward,2,1,1,0,5833,0,2 -6830,alias_default_1206,call_function,alias.default,unknown,,1,1,3,5840,291,4 -6831,einsum_default_555,call_function,einsum.default,backward,2,2,2,1,5841,3,5 -6832,permute_1127,call_function,permute.default,backward,2,1,1,1,4,287,3 -6833,einsum_default_556,call_function,einsum.default,backward,2,2,2,1,5842,286,5 -6834,permute_1128,call_function,permute.default,backward,2,1,1,1,5842,2,4 -6835,dtype_cast_486,call_function,dtype_cast.default,backward,2,1,1,1,5843,1,4 -6836,alias_default_1268,call_function,alias.default,backward,2,1,1,0,5844,0,3 -6837,view_1206,call_function,view.default,backward,2,1,1,1,5843,285,4 -6838,permute_1129,call_function,permute.default,backward,2,1,1,1,5844,284,4 -6839,_scaled_dot_product_flash_attention_backward_25,call_function,_scaled_dot_product_flash_attention_backward.default,backward,2,8,8,3,5848,283,2 -6840,getitem_327,call_function,getitem,backward,2,1,1,1,5849,256,2 -6841,getitem_328,call_function,getitem,backward,2,1,1,1,5849,257,2 -6842,getitem_329,call_function,getitem,backward,2,1,1,1,5849,250,2 -6843,permute_1130,call_function,permute.default,backward,2,1,1,1,5850,249,2 -6844,permute_1131,call_function,permute.default,backward,2,1,1,1,5850,256,2 -6845,permute_1132,call_function,permute.default,backward,2,1,1,1,5850,255,2 
-6846,convert_element_type_1834,call_function,convert_element_type.default,backward,2,1,1,1,5851,255,2 -6847,convert_element_type_1835,call_function,convert_element_type.default,backward,2,1,1,1,5851,254,2 -6848,view_1207,call_function,view.default,backward,2,1,1,1,5852,254,2 -6849,view_as_complex_106,call_function,view_as_complex.default,backward,2,1,1,1,5853,253,6 -6850,_conj_50,call_function,_conj.default,backward,2,1,1,1,4,254,3 -6851,clone_206,call_function,clone.default,backward,2,1,1,1,5,253,3 -6852,mul_716,call_function,mul.Tensor,backward,2,2,2,1,5856,252,8 -6853,view_1208,call_function,view.default,backward,2,1,1,1,5852,253,2 -6854,view_as_complex_107,call_function,view_as_complex.default,backward,2,1,1,1,5853,252,6 -6855,_conj_51,call_function,_conj.default,backward,2,1,1,1,4,253,3 -6856,clone_207,call_function,clone.default,backward,2,1,1,1,5,252,3 -6857,mul_717,call_function,mul.Tensor,backward,2,2,2,1,5856,251,8 -6858,view_as_real_106,call_function,view_as_real.default,backward,2,1,1,1,5857,251,6 -6859,view_1209,call_function,view.default,backward,2,1,1,1,5858,250,6 -6860,convert_element_type_1836,call_function,convert_element_type.default,backward,2,1,1,1,5859,249,6 -6861,view_as_real_107,call_function,view_as_real.default,backward,2,1,1,1,5857,250,6 -6862,view_1210,call_function,view.default,backward,2,1,1,1,5858,249,6 -6863,convert_element_type_1837,call_function,convert_element_type.default,backward,2,1,1,1,5859,248,6 -6864,view_1211,call_function,view.default,backward,2,1,1,1,5851,248,2 -6865,view_1212,call_function,view.default,backward,2,1,1,1,5860,248,5 -6866,view_1213,call_function,view.default,backward,2,1,1,1,5860,247,5 -6867,alias_default_1207,call_function,alias.default,backward,2,1,1,2,5852,247,4 -6868,einsum_default_557,call_function,einsum.default,backward,2,2,2,1,5853,3,5 -6869,permute_1135,call_function,permute.default,backward,2,1,1,1,4,243,3 -6870,einsum_default_558,call_function,einsum.default,backward,2,2,2,1,5854,242,5 
-6871,permute_1136,call_function,permute.default,backward,2,1,1,1,5854,2,4 -6872,dtype_cast_487,call_function,dtype_cast.default,backward,2,1,1,1,5855,1,4 -6873,alias_default_1267,call_function,alias.default,backward,2,1,1,0,5856,0,3 -6874,alias_default_1208,call_function,alias.default,backward,2,1,1,2,5861,247,4 -6875,einsum_default_559,call_function,einsum.default,backward,2,2,2,1,5862,3,5 -6876,permute_1139,call_function,permute.default,backward,2,1,1,1,4,243,3 -6877,einsum_default_560,call_function,einsum.default,backward,2,2,2,1,5863,242,5 -6878,add_320,call_function,add.Tensor,unknown,,2,2,1,5870,241,10 -6879,permute_1140,call_function,permute.default,backward,2,1,1,1,5863,2,4 -6880,dtype_cast_488,call_function,dtype_cast.default,backward,2,1,1,1,5864,1,4 -6881,alias_default_1266,call_function,alias.default,backward,2,1,1,0,5865,0,3 -6882,alias_default_1209,call_function,alias.default,backward,2,1,1,2,5861,246,4 -6883,einsum_default_561,call_function,einsum.default,backward,2,2,2,1,5862,3,5 -6884,permute_1143,call_function,permute.default,backward,2,1,1,1,4,242,3 -6885,einsum_default_562,call_function,einsum.default,backward,2,2,2,1,5863,241,5 -6886,add_321,call_function,add.Tensor,unknown,,2,2,1,5886,240,10 -6887,permute_1144,call_function,permute.default,backward,2,1,1,1,5863,2,4 -6888,dtype_cast_489,call_function,dtype_cast.default,backward,2,1,1,1,5864,1,4 -6889,alias_default_1265,call_function,alias.default,backward,2,1,1,0,5865,0,3 -6890,convert_element_type_1850,call_function,convert_element_type.default,backward,2,1,1,1,5887,239,8 -6891,convert_element_type_1851,call_function,convert_element_type.default,backward,2,1,1,1,229,239,4 -6892,convert_element_type_1852,call_function,convert_element_type.default,backward,2,1,1,1,3,233,2 -6893,alias_default_1210,call_function,alias.default,backward,2,1,1,2,5888,238,4 -6894,mul_718,call_function,mul.Tensor,backward,2,2,2,1,5890,232,8 -6895,mul_719,call_function,mul.Tensor,backward,2,2,2,1,237,238,8 
-6896,alias_default_1211,call_function,alias.default,backward,2,1,1,2,5891,231,4 -6897,alias_default_1212,call_function,alias.default,backward,2,1,1,3,238,237,4 -6898,mul_720,call_function,mul.Tensor,backward,2,2,2,1,5895,230,8 -6899,sum_105,call_function,sum.dim_IntList,backward,2,1,1,1,5896,229,5 -6900,div_80,call_function,div.Tensor,backward,2,1,1,1,239,229,6 -6901,mul_721,call_function,mul.Tensor,backward,2,2,2,1,5898,228,8 -6902,sub_78,call_function,sub.Tensor,backward,2,2,2,1,5899,227,10 -6903,mul_722,call_function,mul.Tensor,backward,2,2,2,1,5900,226,8 -6904,mul_723,call_function,mul.Tensor,backward,2,2,2,1,5892,4,8 -6905,sum_106,call_function,sum.dim_IntList,backward,2,1,1,1,5893,3,5 -6906,convert_element_type_1853,call_function,convert_element_type.default,backward,2,1,1,1,5901,225,6 -6907,convert_element_type_1854,call_function,convert_element_type.default,backward,2,1,1,1,5894,2,3 -6908,add_322,call_function,add.Tensor,unknown,,2,2,1,5902,224,10 -6909,dtype_cast_490,call_function,dtype_cast.default,backward,2,1,1,1,5895,1,3 -6910,alias_default_1272,call_function,alias.default,backward,2,1,1,0,5896,0,2 -6911,alias_default_1213,call_function,alias.default,unknown,,1,1,3,5903,223,4 -6912,einsum_default_563,call_function,einsum.default,backward,1,2,2,1,5904,3,5 -6913,permute_1147,call_function,permute.default,backward,1,1,1,1,4,219,3 -6914,einsum_default_564,call_function,einsum.default,backward,1,2,2,1,5905,218,5 -6915,permute_1148,call_function,permute.default,backward,1,1,1,1,5905,2,4 -6916,dtype_cast_491,call_function,dtype_cast.default,backward,1,1,1,1,5906,1,4 -6917,alias_default_1261,call_function,alias.default,backward,1,1,1,0,5907,0,3 -6918,alias_default_1214,call_function,alias.default,backward,1,1,1,2,5906,217,4 -6919,mul_724,call_function,mul.Tensor,backward,1,2,2,1,5907,205,8 -6920,mul_725,call_function,mul.Tensor,backward,1,2,2,1,5907,209,8 -6921,alias_default_1215,call_function,alias.default,backward,1,1,1,2,5908,204,4 
-6922,einsum_default_565,call_function,einsum.default,backward,1,2,2,1,5909,3,5 -6923,permute_1151,call_function,permute.default,backward,1,1,1,1,4,200,3 -6924,einsum_default_566,call_function,einsum.default,backward,1,2,2,1,5910,199,5 -6925,permute_1152,call_function,permute.default,backward,1,1,1,1,5910,2,4 -6926,dtype_cast_492,call_function,dtype_cast.default,backward,1,1,1,1,5911,1,4 -6927,alias_default_1262,call_function,alias.default,backward,1,1,1,0,5912,0,3 -6928,convert_element_type_1863,call_function,convert_element_type.default,backward,1,1,1,1,5908,208,6 -6929,convert_element_type_1864,call_function,convert_element_type.default,backward,1,1,1,1,206,218,4 -6930,alias_default_1216,call_function,alias.default,backward,1,1,1,2,207,217,4 -6931,neg_54,call_function,neg.default,backward,1,1,1,1,208,216,8 -6932,exp_54,call_function,exp.default,backward,1,1,1,1,209,215,6 -6933,add_323,call_function,add.Tensor,backward,1,1,1,1,210,214,4 -6934,reciprocal_26,call_function,reciprocal.default,backward,1,1,1,1,211,213,4 -6935,mul_726,call_function,mul.Tensor,backward,1,1,1,1,212,212,6 -6936,alias_default_1217,call_function,alias.default,backward,1,1,1,2,213,211,4 -6937,mul_727,call_function,mul.Tensor,backward,1,2,2,1,5917,207,8 -6938,sub_79,call_function,sub.Tensor,backward,1,1,1,1,214,209,4 -6939,mul_728,call_function,mul.Tensor,backward,1,2,2,1,215,208,8 -6940,add_324,call_function,add.Tensor,backward,1,1,1,1,216,207,4 -6941,mul_729,call_function,mul.Tensor,backward,1,2,2,1,5921,206,8 -6942,convert_element_type_1865,call_function,convert_element_type.default,backward,1,1,1,1,5922,205,6 -6943,alias_default_1218,call_function,alias.default,backward,1,1,1,2,5923,204,4 -6944,einsum_default_567,call_function,einsum.default,backward,1,2,2,1,5924,3,5 -6945,permute_1155,call_function,permute.default,backward,1,1,1,1,4,200,3 -6946,einsum_default_568,call_function,einsum.default,backward,1,2,2,1,5925,199,5 -6947,add_325,call_function,add.Tensor,unknown,,2,2,1,5930,198,10 
-6948,permute_1156,call_function,permute.default,backward,1,1,1,1,5925,2,4 -6949,dtype_cast_493,call_function,dtype_cast.default,backward,1,1,1,1,5926,1,4 -6950,alias_default_1260,call_function,alias.default,backward,1,1,1,0,5927,0,3 -6951,convert_element_type_1870,call_function,convert_element_type.default,backward,1,1,1,1,5931,197,8 -6952,convert_element_type_1871,call_function,convert_element_type.default,backward,1,1,1,1,186,197,4 -6953,convert_element_type_1872,call_function,convert_element_type.default,backward,1,1,1,1,3,191,2 -6954,alias_default_1219,call_function,alias.default,backward,1,1,1,2,5932,196,4 -6955,mul_730,call_function,mul.Tensor,backward,1,2,2,1,5934,190,8 -6956,mul_731,call_function,mul.Tensor,backward,1,2,2,1,194,196,8 -6957,alias_default_1220,call_function,alias.default,backward,1,1,1,2,5935,189,4 -6958,alias_default_1221,call_function,alias.default,backward,1,1,1,3,195,195,4 -6959,mul_732,call_function,mul.Tensor,backward,1,2,2,1,5939,188,8 -6960,sum_107,call_function,sum.dim_IntList,backward,1,1,1,1,5940,187,5 -6961,div_81,call_function,div.Tensor,backward,1,1,1,1,196,187,6 -6962,mul_733,call_function,mul.Tensor,backward,1,2,2,1,5942,186,8 -6963,sub_80,call_function,sub.Tensor,backward,1,2,2,1,5943,185,10 -6964,mul_734,call_function,mul.Tensor,backward,1,2,2,1,5944,184,8 -6965,mul_735,call_function,mul.Tensor,backward,1,2,2,1,5936,4,8 -6966,sum_108,call_function,sum.dim_IntList,backward,1,1,1,1,5937,3,5 -6967,convert_element_type_1873,call_function,convert_element_type.default,backward,1,1,1,1,5945,183,6 -6968,convert_element_type_1874,call_function,convert_element_type.default,backward,1,1,1,1,5938,2,3 -6969,add_326,call_function,add.Tensor,unknown,,2,2,1,5946,182,10 -6970,dtype_cast_494,call_function,dtype_cast.default,backward,1,1,1,1,5939,1,3 -6971,alias_default_1264,call_function,alias.default,backward,1,1,1,0,5940,0,2 -6972,alias_default_1222,call_function,alias.default,unknown,,1,1,3,5947,181,4 
-6973,einsum_default_569,call_function,einsum.default,backward,1,2,2,1,5948,3,5 -6974,permute_1159,call_function,permute.default,backward,1,1,1,1,4,177,3 -6975,einsum_default_570,call_function,einsum.default,backward,1,2,2,1,5949,176,5 -6976,permute_1160,call_function,permute.default,backward,1,1,1,1,5949,2,4 -6977,dtype_cast_495,call_function,dtype_cast.default,backward,1,1,1,1,5950,1,4 -6978,alias_default_1259,call_function,alias.default,backward,1,1,1,0,5951,0,3 -6979,view_1228,call_function,view.default,backward,1,1,1,1,5950,175,4 -6980,permute_1161,call_function,permute.default,backward,1,1,1,1,5951,174,4 -6981,_scaled_dot_product_flash_attention_backward_26,call_function,_scaled_dot_product_flash_attention_backward.default,backward,1,8,8,3,5955,173,2 -6982,getitem_330,call_function,getitem,backward,1,1,1,1,5956,146,2 -6983,getitem_331,call_function,getitem,backward,1,1,1,1,5956,147,2 -6984,getitem_332,call_function,getitem,backward,1,1,1,1,5956,140,2 -6985,permute_1162,call_function,permute.default,backward,1,1,1,1,5957,139,2 -6986,permute_1163,call_function,permute.default,backward,1,1,1,1,5957,146,2 -6987,permute_1164,call_function,permute.default,backward,1,1,1,1,5957,145,2 -6988,convert_element_type_1879,call_function,convert_element_type.default,backward,1,1,1,1,5958,145,2 -6989,convert_element_type_1880,call_function,convert_element_type.default,backward,1,1,1,1,5958,144,2 -6990,view_1229,call_function,view.default,backward,1,1,1,1,5959,144,2 -6991,view_as_complex_108,call_function,view_as_complex.default,backward,1,1,1,1,5960,143,6 -6992,_conj_52,call_function,_conj.default,backward,1,1,1,1,4,144,3 -6993,clone_214,call_function,clone.default,backward,1,1,1,1,5,143,3 -6994,mul_736,call_function,mul.Tensor,backward,1,2,2,1,5963,142,8 -6995,view_1230,call_function,view.default,backward,1,1,1,1,5959,143,2 -6996,view_as_complex_109,call_function,view_as_complex.default,backward,1,1,1,1,5960,142,6 
-6997,_conj_53,call_function,_conj.default,backward,1,1,1,1,4,143,3 -6998,clone_215,call_function,clone.default,backward,1,1,1,1,5,142,3 -6999,mul_737,call_function,mul.Tensor,backward,1,2,2,1,5963,141,8 -7000,view_as_real_108,call_function,view_as_real.default,backward,1,1,1,1,5964,141,6 -7001,view_1231,call_function,view.default,backward,1,1,1,1,5965,140,6 -7002,convert_element_type_1881,call_function,convert_element_type.default,backward,1,1,1,1,5966,139,6 -7003,view_as_real_109,call_function,view_as_real.default,backward,1,1,1,1,5964,140,6 -7004,view_1232,call_function,view.default,backward,1,1,1,1,5965,139,6 -7005,convert_element_type_1882,call_function,convert_element_type.default,backward,1,1,1,1,5966,138,6 -7006,view_1233,call_function,view.default,backward,1,1,1,1,5958,138,2 -7007,view_1234,call_function,view.default,backward,1,1,1,1,5967,138,5 -7008,view_1235,call_function,view.default,backward,1,1,1,1,5967,137,5 -7009,alias_default_1223,call_function,alias.default,backward,1,1,1,2,5959,137,4 -7010,einsum_default_571,call_function,einsum.default,backward,1,2,2,1,5960,3,5 -7011,permute_1167,call_function,permute.default,backward,1,1,1,1,4,133,3 -7012,einsum_default_572,call_function,einsum.default,backward,1,2,2,1,5961,132,5 -7013,permute_1168,call_function,permute.default,backward,1,1,1,1,5961,2,4 -7014,dtype_cast_496,call_function,dtype_cast.default,backward,1,1,1,1,5962,1,4 -7015,alias_default_1258,call_function,alias.default,backward,1,1,1,0,5963,0,3 -7016,alias_default_1224,call_function,alias.default,backward,1,1,1,2,5968,137,4 -7017,einsum_default_573,call_function,einsum.default,backward,1,2,2,1,5969,3,5 -7018,permute_1171,call_function,permute.default,backward,1,1,1,1,4,133,3 -7019,einsum_default_574,call_function,einsum.default,backward,1,2,2,1,5970,132,5 -7020,add_327,call_function,add.Tensor,unknown,,2,2,1,5977,131,10 -7021,permute_1172,call_function,permute.default,backward,1,1,1,1,5970,2,4 
-7022,dtype_cast_497,call_function,dtype_cast.default,backward,1,1,1,1,5971,1,4 -7023,alias_default_1257,call_function,alias.default,backward,1,1,1,0,5972,0,3 -7024,alias_default_1225,call_function,alias.default,backward,1,1,1,2,5968,136,4 -7025,einsum_default_575,call_function,einsum.default,backward,1,2,2,1,5969,3,5 -7026,permute_1175,call_function,permute.default,backward,1,1,1,1,4,132,3 -7027,einsum_default_576,call_function,einsum.default,backward,1,2,2,1,5970,131,5 -7028,add_328,call_function,add.Tensor,unknown,,2,2,1,5993,130,10 -7029,permute_1176,call_function,permute.default,backward,1,1,1,1,5970,2,4 -7030,dtype_cast_498,call_function,dtype_cast.default,backward,1,1,1,1,5971,1,4 -7031,alias_default_1256,call_function,alias.default,backward,1,1,1,0,5972,0,3 -7032,convert_element_type_1895,call_function,convert_element_type.default,backward,1,1,1,1,5994,129,8 -7033,convert_element_type_1896,call_function,convert_element_type.default,backward,1,1,1,1,119,129,4 -7034,convert_element_type_1897,call_function,convert_element_type.default,backward,1,1,1,1,3,123,2 -7035,alias_default_1226,call_function,alias.default,backward,1,1,1,2,5995,128,4 -7036,mul_738,call_function,mul.Tensor,backward,1,2,2,1,5997,122,8 -7037,mul_739,call_function,mul.Tensor,backward,1,2,2,1,127,128,8 -7038,alias_default_1227,call_function,alias.default,backward,1,1,1,2,5998,121,4 -7039,alias_default_1228,call_function,alias.default,backward,1,1,1,3,128,127,4 -7040,mul_740,call_function,mul.Tensor,backward,1,2,2,1,6002,120,8 -7041,sum_109,call_function,sum.dim_IntList,backward,1,1,1,1,6003,119,5 -7042,div_82,call_function,div.Tensor,backward,1,1,1,1,129,119,6 -7043,mul_741,call_function,mul.Tensor,backward,1,2,2,1,6005,118,8 -7044,sub_81,call_function,sub.Tensor,backward,1,2,2,1,6006,117,10 -7045,mul_742,call_function,mul.Tensor,backward,1,2,2,1,6007,116,8 -7046,mul_743,call_function,mul.Tensor,backward,1,2,2,1,5999,4,8 -7047,sum_110,call_function,sum.dim_IntList,backward,1,1,1,1,6000,3,5 
-7048,convert_element_type_1898,call_function,convert_element_type.default,backward,1,1,1,1,6008,115,6 -7049,convert_element_type_1899,call_function,convert_element_type.default,backward,1,1,1,1,6001,2,3 -7050,add_329,call_function,add.Tensor,unknown,,2,2,1,6009,114,10 -7051,dtype_cast_499,call_function,dtype_cast.default,backward,1,1,1,1,6002,1,3 -7052,alias_default_1263,call_function,alias.default,backward,1,1,1,0,6003,0,2 -7053,alias_default_1229,call_function,alias.default,unknown,,1,1,3,6010,113,4 -7054,einsum_default_577,call_function,einsum.default,backward,0,2,2,1,6011,3,5 -7055,permute_1179,call_function,permute.default,backward,0,1,1,1,4,109,3 -7056,einsum_default_578,call_function,einsum.default,backward,0,2,2,1,6012,108,5 -7057,permute_1180,call_function,permute.default,backward,0,1,1,1,6012,2,4 -7058,dtype_cast_500,call_function,dtype_cast.default,backward,0,1,1,1,6013,1,4 -7059,alias_default_1252,call_function,alias.default,backward,0,1,1,0,6014,0,3 -7060,alias_default_1230,call_function,alias.default,backward,0,1,1,2,6013,107,4 -7061,mul_744,call_function,mul.Tensor,backward,0,2,2,1,6014,95,8 -7062,mul_745,call_function,mul.Tensor,backward,0,2,2,1,6014,99,8 -7063,alias_default_1231,call_function,alias.default,backward,0,1,1,2,6015,94,4 -7064,einsum_default_579,call_function,einsum.default,backward,0,2,2,1,6016,3,5 -7065,permute_1183,call_function,permute.default,backward,0,1,1,1,4,90,3 -7066,einsum_default_580,call_function,einsum.default,backward,0,2,2,1,6017,89,5 -7067,permute_1184,call_function,permute.default,backward,0,1,1,1,6017,2,4 -7068,dtype_cast_501,call_function,dtype_cast.default,backward,0,1,1,1,6018,1,4 -7069,alias_default_1253,call_function,alias.default,backward,0,1,1,0,6019,0,3 -7070,convert_element_type_1908,call_function,convert_element_type.default,backward,0,1,1,1,6015,98,6 -7071,convert_element_type_1909,call_function,convert_element_type.default,backward,0,1,1,1,96,108,4 
-7072,alias_default_1232,call_function,alias.default,backward,0,1,1,2,97,107,4 -7073,neg_55,call_function,neg.default,backward,0,1,1,1,98,106,8 -7074,exp_55,call_function,exp.default,backward,0,1,1,1,99,105,6 -7075,add_330,call_function,add.Tensor,backward,0,1,1,1,100,104,4 -7076,reciprocal_27,call_function,reciprocal.default,backward,0,1,1,1,101,103,4 -7077,mul_746,call_function,mul.Tensor,backward,0,1,1,1,102,102,6 -7078,alias_default_1233,call_function,alias.default,backward,0,1,1,2,103,101,4 -7079,mul_747,call_function,mul.Tensor,backward,0,2,2,1,6024,97,8 -7080,sub_82,call_function,sub.Tensor,backward,0,1,1,1,104,99,4 -7081,mul_748,call_function,mul.Tensor,backward,0,2,2,1,105,98,8 -7082,add_331,call_function,add.Tensor,backward,0,1,1,1,106,97,4 -7083,mul_749,call_function,mul.Tensor,backward,0,2,2,1,6028,96,8 -7084,convert_element_type_1910,call_function,convert_element_type.default,backward,0,1,1,1,6029,95,6 -7085,alias_default_1234,call_function,alias.default,backward,0,1,1,2,6030,94,4 -7086,einsum_default_581,call_function,einsum.default,backward,0,2,2,1,6031,3,5 -7087,permute_1187,call_function,permute.default,backward,0,1,1,1,4,90,3 -7088,einsum_default_582,call_function,einsum.default,backward,0,2,2,1,6032,89,5 -7089,add_332,call_function,add.Tensor,unknown,,2,2,1,6037,88,10 -7090,permute_1188,call_function,permute.default,backward,0,1,1,1,6032,2,4 -7091,dtype_cast_502,call_function,dtype_cast.default,backward,0,1,1,1,6033,1,4 -7092,alias_default_1251,call_function,alias.default,backward,0,1,1,0,6034,0,3 -7093,convert_element_type_1915,call_function,convert_element_type.default,backward,0,1,1,1,6038,87,8 -7094,convert_element_type_1916,call_function,convert_element_type.default,backward,0,1,1,1,76,87,4 -7095,convert_element_type_1917,call_function,convert_element_type.default,backward,0,1,1,1,3,81,2 -7096,alias_default_1235,call_function,alias.default,backward,0,1,1,2,6039,86,4 -7097,mul_750,call_function,mul.Tensor,backward,0,2,2,1,6041,80,8 
-7098,mul_751,call_function,mul.Tensor,backward,0,2,2,1,84,86,8 -7099,alias_default_1236,call_function,alias.default,backward,0,1,1,2,6042,79,4 -7100,alias_default_1237,call_function,alias.default,backward,0,1,1,3,85,85,4 -7101,mul_752,call_function,mul.Tensor,backward,0,2,2,1,6046,78,8 -7102,sum_111,call_function,sum.dim_IntList,backward,0,1,1,1,6047,77,5 -7103,div_83,call_function,div.Tensor,backward,0,1,1,1,86,77,6 -7104,mul_753,call_function,mul.Tensor,backward,0,2,2,1,6049,76,8 -7105,sub_83,call_function,sub.Tensor,backward,0,2,2,1,6050,75,10 -7106,mul_754,call_function,mul.Tensor,backward,0,2,2,1,6051,74,8 -7107,mul_755,call_function,mul.Tensor,backward,0,2,2,1,6043,4,8 -7108,sum_112,call_function,sum.dim_IntList,backward,0,1,1,1,6044,3,5 -7109,convert_element_type_1918,call_function,convert_element_type.default,backward,0,1,1,1,6052,73,6 -7110,convert_element_type_1919,call_function,convert_element_type.default,backward,0,1,1,1,6045,2,3 -7111,add_333,call_function,add.Tensor,unknown,,2,2,1,6053,72,10 -7112,dtype_cast_503,call_function,dtype_cast.default,backward,0,1,1,1,6046,1,3 -7113,alias_default_1255,call_function,alias.default,backward,0,1,1,0,6047,0,2 -7114,alias_default_1238,call_function,alias.default,unknown,,1,1,3,6054,71,4 -7115,einsum_default_583,call_function,einsum.default,backward,0,2,2,1,6055,3,5 -7116,permute_1191,call_function,permute.default,backward,0,1,1,1,4,67,3 -7117,einsum_default_584,call_function,einsum.default,backward,0,2,2,1,6056,66,5 -7118,permute_1192,call_function,permute.default,backward,0,1,1,1,6056,2,4 -7119,dtype_cast_504,call_function,dtype_cast.default,backward,0,1,1,1,6057,1,4 -7120,alias_default_1250,call_function,alias.default,backward,0,1,1,0,6058,0,3 -7121,view_1250,call_function,view.default,backward,0,1,1,1,6057,65,4 -7122,permute_1193,call_function,permute.default,backward,0,1,1,1,6058,64,4 
-7123,_scaled_dot_product_flash_attention_backward_27,call_function,_scaled_dot_product_flash_attention_backward.default,backward,0,8,8,3,6062,63,2 -7124,getitem_333,call_function,getitem,backward,0,1,1,1,6063,36,2 -7125,getitem_334,call_function,getitem,backward,0,1,1,1,6063,37,2 -7126,getitem_335,call_function,getitem,backward,0,1,1,1,6063,30,2 -7127,permute_1194,call_function,permute.default,backward,0,1,1,1,6064,29,2 -7128,permute_1195,call_function,permute.default,backward,0,1,1,1,6064,36,2 -7129,permute_1196,call_function,permute.default,backward,0,1,1,1,6064,35,2 -7130,convert_element_type_1924,call_function,convert_element_type.default,backward,0,1,1,1,6065,35,2 -7131,convert_element_type_1925,call_function,convert_element_type.default,backward,0,1,1,1,6065,34,2 -7132,view_1251,call_function,view.default,backward,0,1,1,1,6066,34,2 -7133,view_as_complex_110,call_function,view_as_complex.default,backward,0,1,1,1,6067,33,6 -7134,_conj_54,call_function,_conj.default,backward,0,1,1,1,4,34,3 -7135,clone_222,call_function,clone.default,backward,0,1,1,1,5,33,3 -7136,mul_756,call_function,mul.Tensor,backward,0,2,2,1,6070,32,8 -7137,view_1252,call_function,view.default,backward,0,1,1,1,6066,33,2 -7138,view_as_complex_111,call_function,view_as_complex.default,backward,0,1,1,1,6067,32,6 -7139,_conj_55,call_function,_conj.default,backward,0,1,1,1,4,33,3 -7140,clone_223,call_function,clone.default,backward,0,1,1,1,5,32,3 -7141,mul_757,call_function,mul.Tensor,backward,0,2,2,1,6070,31,8 -7142,view_as_real_110,call_function,view_as_real.default,backward,0,1,1,1,6071,31,6 -7143,view_1253,call_function,view.default,backward,0,1,1,1,6072,30,6 -7144,convert_element_type_1926,call_function,convert_element_type.default,backward,0,1,1,1,6073,29,6 -7145,view_as_real_111,call_function,view_as_real.default,backward,0,1,1,1,6071,30,6 -7146,view_1254,call_function,view.default,backward,0,1,1,1,6072,29,6 
-7147,convert_element_type_1927,call_function,convert_element_type.default,backward,0,1,1,1,6073,28,6 -7148,view_1255,call_function,view.default,backward,0,1,1,1,6065,28,2 -7149,view_1256,call_function,view.default,backward,0,1,1,1,6074,28,5 -7150,view_1257,call_function,view.default,backward,0,1,1,1,6074,27,5 -7151,alias_default_1239,call_function,alias.default,backward,0,1,1,2,6066,27,4 -7152,einsum_default_585,call_function,einsum.default,backward,0,2,2,1,6067,3,5 -7153,permute_1199,call_function,permute.default,backward,0,1,1,1,4,23,3 -7154,einsum_default_586,call_function,einsum.default,backward,0,2,2,1,6068,22,5 -7155,permute_1200,call_function,permute.default,backward,0,1,1,1,6068,2,4 -7156,dtype_cast_505,call_function,dtype_cast.default,backward,0,1,1,1,6069,1,4 -7157,alias_default_1249,call_function,alias.default,backward,0,1,1,0,6070,0,3 -7158,alias_default_1240,call_function,alias.default,backward,0,1,1,2,6075,27,4 -7159,einsum_default_587,call_function,einsum.default,backward,0,2,2,1,6076,3,5 -7160,permute_1203,call_function,permute.default,backward,0,1,1,1,4,23,3 -7161,einsum_default_588,call_function,einsum.default,backward,0,2,2,1,6077,22,5 -7162,add_334,call_function,add.Tensor,unknown,,2,2,1,6084,21,10 -7163,permute_1204,call_function,permute.default,backward,0,1,1,1,6077,2,4 -7164,dtype_cast_506,call_function,dtype_cast.default,backward,0,1,1,1,6078,1,4 -7165,alias_default_1248,call_function,alias.default,backward,0,1,1,0,6079,0,3 -7166,alias_default_1241,call_function,alias.default,backward,0,1,1,2,6075,26,4 -7167,einsum_default_589,call_function,einsum.default,backward,0,2,2,1,6076,3,5 -7168,permute_1207,call_function,permute.default,backward,0,1,1,1,4,22,3 -7169,einsum_default_590,call_function,einsum.default,backward,0,2,2,1,6077,21,5 -7170,add_335,call_function,add.Tensor,unknown,,2,2,1,6100,20,10 -7171,permute_1208,call_function,permute.default,backward,0,1,1,1,6077,2,4 
-7172,dtype_cast_507,call_function,dtype_cast.default,backward,0,1,1,1,6078,1,4 -7173,alias_default_1247,call_function,alias.default,backward,0,1,1,0,6079,0,3 -7174,convert_element_type_1940,call_function,convert_element_type.default,backward,0,1,1,1,6101,19,8 -7175,convert_element_type_1941,call_function,convert_element_type.default,backward,0,1,1,1,7,19,4 -7176,convert_element_type_1942,call_function,convert_element_type.default,backward,0,1,1,1,3,13,2 -7177,alias_default_1242,call_function,alias.default,backward,0,1,1,2,6102,18,4 -7178,mul_758,call_function,mul.Tensor,backward,0,2,2,1,6104,12,8 -7179,mul_759,call_function,mul.Tensor,backward,0,2,2,1,15,18,8 -7180,alias_default_1243,call_function,alias.default,backward,0,1,1,2,6105,11,4 -7181,alias_default_1244,call_function,alias.default,backward,0,1,1,3,16,17,4 -7182,mul_760,call_function,mul.Tensor,backward,0,2,2,1,6109,10,8 -7183,sum_113,call_function,sum.dim_IntList,backward,0,1,1,1,6110,9,5 -7184,div_84,call_function,div.Tensor,backward,0,1,1,1,17,9,6 -7185,mul_761,call_function,mul.Tensor,backward,0,2,2,1,6112,8,8 -7186,sub_84,call_function,sub.Tensor,backward,0,2,2,1,6113,7,10 -7187,mul_762,call_function,mul.Tensor,backward,0,2,2,1,6114,6,8 -7188,mul_763,call_function,mul.Tensor,backward,0,2,2,1,6106,4,8 -7189,sum_114,call_function,sum.dim_IntList,backward,0,1,1,1,6107,3,5 -7190,convert_element_type_1943,call_function,convert_element_type.default,backward,0,1,1,1,6115,5,6 -7191,convert_element_type_1944,call_function,convert_element_type.default,backward,0,1,1,1,6108,2,3 -7192,add_336,call_function,add.Tensor,unknown,,2,2,1,6116,4,10 -7193,dtype_cast_508,call_function,dtype_cast.default,backward,0,1,1,1,6109,1,3 -7194,alias_default_1254,call_function,alias.default,backward,0,1,1,0,6110,0,2 -7195,embedding_dense_backward,call_function,embedding_dense_backward.default,backward,,2,2,1,6117,3,5 -7196,dtype_cast_509,call_function,dtype_cast.default,backward,,1,1,1,6118,2,3 
-7197,add_337,call_function,add.Tensor,unknown,,2,2,1,6126,1,9 -7198,alias_default_1246,call_function,alias.default,unknown,,1,1,0,6127,0,3 diff --git a/profile_results/real_llama3_3b_dag_summary.json b/profile_results/real_llama3_3b_dag_summary.json deleted file mode 100644 index 93434ea9..00000000 --- a/profile_results/real_llama3_3b_dag_summary.json +++ /dev/null @@ -1,883 +0,0 @@ -{ - "branch_points": 1301, - "dag_edges": 8805, - "direct_dependency_histogram": { - "0": 257, - "1": 5275, - "2": 1611, - "3": 28, - "8": 28 - }, - "direct_offspring_histogram": { - "0": 255, - "1": 5643, - "2": 934, - "3": 254, - "4": 84, - "6": 28, - "28": 1 - }, - "ilp_nodes": 7199, - "max_ancestor_count": 6127, - "max_descendant_count": 5943, - "max_direct_dependency_nodes": 8, - "max_direct_offspring_nodes": 28, - "merge_points": 1667, - "merge_points_csv": "profile_results/real_llama3_3b_merge_points.csv", - "mesh": "1D 64", - "model": "LLaMA3 3B", - "node_stats_csv": "profile_results/real_llama3_3b_dag_node_stats.csv", - "top_fanout_points": [ - { - "ancestor_count": 1, - "descendant_count": 5942, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 28, - "idx": 296, - "layer": "", - "name": "alias_default_1", - "op": "call_function", - "phase": "unknown", - "strategy_count": 3, - "target": "alias.default" - }, - { - "ancestor_count": 20, - "descendant_count": 5788, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 276, - "layer": 0, - "name": "alias_default_8", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 132, - "descendant_count": 5692, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 381, - "layer": 1, - "name": "alias_default_36", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - 
"ancestor_count": 242, - "descendant_count": 5596, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 485, - "layer": 2, - "name": "alias_default_64", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 352, - "descendant_count": 5500, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 589, - "layer": 3, - "name": "alias_default_92", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 462, - "descendant_count": 5404, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 693, - "layer": 4, - "name": "alias_default_120", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 572, - "descendant_count": 5308, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 797, - "layer": 5, - "name": "alias_default_148", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 682, - "descendant_count": 5212, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 901, - "layer": 6, - "name": "alias_default_176", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 792, - "descendant_count": 5116, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 1005, - "layer": 7, - "name": "alias_default_204", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 902, - "descendant_count": 5020, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - 
"direct_offspring_nodes": 6, - "idx": 1109, - "layer": 8, - "name": "alias_default_232", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 1012, - "descendant_count": 4924, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 1213, - "layer": 9, - "name": "alias_default_260", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 1122, - "descendant_count": 4828, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 1317, - "layer": 10, - "name": "alias_default_288", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 1232, - "descendant_count": 4732, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 1421, - "layer": 11, - "name": "alias_default_316", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 1342, - "descendant_count": 4636, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 1525, - "layer": 12, - "name": "alias_default_344", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 1452, - "descendant_count": 4540, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 1629, - "layer": 13, - "name": "alias_default_372", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 1562, - "descendant_count": 4444, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 1733, - "layer": 14, - "name": "alias_default_400", - "op": 
"call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 1672, - "descendant_count": 4348, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 1837, - "layer": 15, - "name": "alias_default_428", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 1782, - "descendant_count": 4252, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 1941, - "layer": 16, - "name": "alias_default_456", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 1892, - "descendant_count": 4156, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 2045, - "layer": 17, - "name": "alias_default_484", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 2002, - "descendant_count": 4060, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 2149, - "layer": 18, - "name": "alias_default_512", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 2112, - "descendant_count": 3964, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 2253, - "layer": 19, - "name": "alias_default_540", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 2222, - "descendant_count": 3868, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 2357, - "layer": 20, - "name": "alias_default_568", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - 
"ancestor_count": 2332, - "descendant_count": 3772, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 2461, - "layer": 21, - "name": "alias_default_596", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 2442, - "descendant_count": 3676, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 2565, - "layer": 22, - "name": "alias_default_624", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 2552, - "descendant_count": 3580, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 2669, - "layer": 23, - "name": "alias_default_652", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 2662, - "descendant_count": 3484, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 2773, - "layer": 24, - "name": "alias_default_680", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 2772, - "descendant_count": 3388, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 2877, - "layer": 25, - "name": "alias_default_708", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 2882, - "descendant_count": 3292, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 2981, - "layer": 26, - "name": "alias_default_736", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 2992, - "descendant_count": 3196, - "direct_dependency_args": 1, - 
"direct_dependency_nodes": 1, - "direct_offspring_nodes": 6, - "idx": 3085, - "layer": 27, - "name": "alias_default_764", - "op": "call_function", - "phase": "forward", - "strategy_count": 4, - "target": "alias.default" - }, - { - "ancestor_count": 3, - "descendant_count": 5778, - "direct_dependency_args": 1, - "direct_dependency_nodes": 1, - "direct_offspring_nodes": 4, - "idx": 298, - "layer": 0, - "name": "alias_default_12", - "op": "call_function", - "phase": "forward", - "strategy_count": 3, - "target": "alias.default" - } - ], - "top_merge_points": [ - { - "ancestor_count": 3173, - "descendant_count": 3033, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 3289, - "layer": 27, - "name": "_scaled_dot_product_flash_attention_backward", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 3280, - "descendant_count": 2923, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 3431, - "layer": 26, - "name": "_scaled_dot_product_flash_attention_backward_1", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 3387, - "descendant_count": 2813, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 3573, - "layer": 25, - "name": "_scaled_dot_product_flash_attention_backward_2", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 3494, - "descendant_count": 2703, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 3715, - "layer": 24, - "name": "_scaled_dot_product_flash_attention_backward_3", - "op": "call_function", - "phase": 
"backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 3601, - "descendant_count": 2593, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 3857, - "layer": 23, - "name": "_scaled_dot_product_flash_attention_backward_4", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 3708, - "descendant_count": 2483, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 3999, - "layer": 22, - "name": "_scaled_dot_product_flash_attention_backward_5", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 3815, - "descendant_count": 2373, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 4141, - "layer": 21, - "name": "_scaled_dot_product_flash_attention_backward_6", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 3922, - "descendant_count": 2263, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 4283, - "layer": 20, - "name": "_scaled_dot_product_flash_attention_backward_7", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 4029, - "descendant_count": 2153, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 4425, - "layer": 19, - "name": "_scaled_dot_product_flash_attention_backward_8", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": 
"_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 4136, - "descendant_count": 2043, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 4567, - "layer": 18, - "name": "_scaled_dot_product_flash_attention_backward_9", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 4243, - "descendant_count": 1933, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 4709, - "layer": 17, - "name": "_scaled_dot_product_flash_attention_backward_10", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 4350, - "descendant_count": 1823, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 4851, - "layer": 16, - "name": "_scaled_dot_product_flash_attention_backward_11", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 4457, - "descendant_count": 1713, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 4993, - "layer": 15, - "name": "_scaled_dot_product_flash_attention_backward_12", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 4564, - "descendant_count": 1603, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 5135, - "layer": 14, - "name": "_scaled_dot_product_flash_attention_backward_13", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - 
"ancestor_count": 4671, - "descendant_count": 1493, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 5277, - "layer": 13, - "name": "_scaled_dot_product_flash_attention_backward_14", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 4778, - "descendant_count": 1383, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 5419, - "layer": 12, - "name": "_scaled_dot_product_flash_attention_backward_15", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 4885, - "descendant_count": 1273, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 5561, - "layer": 11, - "name": "_scaled_dot_product_flash_attention_backward_16", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 4992, - "descendant_count": 1163, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 5703, - "layer": 10, - "name": "_scaled_dot_product_flash_attention_backward_17", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 5099, - "descendant_count": 1053, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 5845, - "layer": 9, - "name": "_scaled_dot_product_flash_attention_backward_18", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 5206, - "descendant_count": 943, - 
"direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 5987, - "layer": 8, - "name": "_scaled_dot_product_flash_attention_backward_19", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 5313, - "descendant_count": 833, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 6129, - "layer": 7, - "name": "_scaled_dot_product_flash_attention_backward_20", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 5420, - "descendant_count": 723, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 6271, - "layer": 6, - "name": "_scaled_dot_product_flash_attention_backward_21", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 5527, - "descendant_count": 613, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 6413, - "layer": 5, - "name": "_scaled_dot_product_flash_attention_backward_22", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 5634, - "descendant_count": 503, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 6555, - "layer": 4, - "name": "_scaled_dot_product_flash_attention_backward_23", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 5741, - "descendant_count": 393, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - 
"direct_offspring_nodes": 3, - "idx": 6697, - "layer": 3, - "name": "_scaled_dot_product_flash_attention_backward_24", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 5848, - "descendant_count": 283, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 6839, - "layer": 2, - "name": "_scaled_dot_product_flash_attention_backward_25", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 5955, - "descendant_count": 173, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 6981, - "layer": 1, - "name": "_scaled_dot_product_flash_attention_backward_26", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 6062, - "descendant_count": 63, - "direct_dependency_args": 8, - "direct_dependency_nodes": 8, - "direct_offspring_nodes": 3, - "idx": 7123, - "layer": 0, - "name": "_scaled_dot_product_flash_attention_backward_27", - "op": "call_function", - "phase": "backward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention_backward.default" - }, - { - "ancestor_count": 63, - "descendant_count": 5761, - "direct_dependency_args": 3, - "direct_dependency_nodes": 3, - "direct_offspring_nodes": 4, - "idx": 313, - "layer": 0, - "name": "_scaled_dot_product_flash_attention", - "op": "call_function", - "phase": "forward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention.default" - }, - { - "ancestor_count": 173, - "descendant_count": 5665, - "direct_dependency_args": 3, - "direct_dependency_nodes": 3, - "direct_offspring_nodes": 4, - "idx": 417, - "layer": 1, - "name": 
"_scaled_dot_product_flash_attention_1", - "op": "call_function", - "phase": "forward", - "strategy_count": 2, - "target": "_scaled_dot_product_flash_attention.default" - } - ], - "trace_and_optimizer_build_s": 38.44014171184972, - "treewidth_upper_bounds": { - "moralized_edges": 11200, - "moralized_min_degree": 10, - "moralized_min_fill": 8, - "undirected_edges": 8805, - "undirected_min_degree": 9, - "undirected_min_fill": 6 - } -} \ No newline at end of file diff --git a/profile_results/real_llama3_3b_merge_points.csv b/profile_results/real_llama3_3b_merge_points.csv deleted file mode 100644 index 4418765e..00000000 --- a/profile_results/real_llama3_3b_merge_points.csv +++ /dev/null @@ -1,1668 +0,0 @@ -idx,name,op,target,phase,layer,direct_dependency_args,direct_dependency_nodes,direct_offspring_nodes,ancestor_count,descendant_count,strategy_count -3289,_scaled_dot_product_flash_attention_backward,call_function,_scaled_dot_product_flash_attention_backward.default,backward,27,8,8,3,3173,3033,2 -3431,_scaled_dot_product_flash_attention_backward_1,call_function,_scaled_dot_product_flash_attention_backward.default,backward,26,8,8,3,3280,2923,2 -3573,_scaled_dot_product_flash_attention_backward_2,call_function,_scaled_dot_product_flash_attention_backward.default,backward,25,8,8,3,3387,2813,2 -3715,_scaled_dot_product_flash_attention_backward_3,call_function,_scaled_dot_product_flash_attention_backward.default,backward,24,8,8,3,3494,2703,2 -3857,_scaled_dot_product_flash_attention_backward_4,call_function,_scaled_dot_product_flash_attention_backward.default,backward,23,8,8,3,3601,2593,2 -3999,_scaled_dot_product_flash_attention_backward_5,call_function,_scaled_dot_product_flash_attention_backward.default,backward,22,8,8,3,3708,2483,2 -4141,_scaled_dot_product_flash_attention_backward_6,call_function,_scaled_dot_product_flash_attention_backward.default,backward,21,8,8,3,3815,2373,2 
-4283,_scaled_dot_product_flash_attention_backward_7,call_function,_scaled_dot_product_flash_attention_backward.default,backward,20,8,8,3,3922,2263,2 -4425,_scaled_dot_product_flash_attention_backward_8,call_function,_scaled_dot_product_flash_attention_backward.default,backward,19,8,8,3,4029,2153,2 -4567,_scaled_dot_product_flash_attention_backward_9,call_function,_scaled_dot_product_flash_attention_backward.default,backward,18,8,8,3,4136,2043,2 -4709,_scaled_dot_product_flash_attention_backward_10,call_function,_scaled_dot_product_flash_attention_backward.default,backward,17,8,8,3,4243,1933,2 -4851,_scaled_dot_product_flash_attention_backward_11,call_function,_scaled_dot_product_flash_attention_backward.default,backward,16,8,8,3,4350,1823,2 -4993,_scaled_dot_product_flash_attention_backward_12,call_function,_scaled_dot_product_flash_attention_backward.default,backward,15,8,8,3,4457,1713,2 -5135,_scaled_dot_product_flash_attention_backward_13,call_function,_scaled_dot_product_flash_attention_backward.default,backward,14,8,8,3,4564,1603,2 -5277,_scaled_dot_product_flash_attention_backward_14,call_function,_scaled_dot_product_flash_attention_backward.default,backward,13,8,8,3,4671,1493,2 -5419,_scaled_dot_product_flash_attention_backward_15,call_function,_scaled_dot_product_flash_attention_backward.default,backward,12,8,8,3,4778,1383,2 -5561,_scaled_dot_product_flash_attention_backward_16,call_function,_scaled_dot_product_flash_attention_backward.default,backward,11,8,8,3,4885,1273,2 -5703,_scaled_dot_product_flash_attention_backward_17,call_function,_scaled_dot_product_flash_attention_backward.default,backward,10,8,8,3,4992,1163,2 -5845,_scaled_dot_product_flash_attention_backward_18,call_function,_scaled_dot_product_flash_attention_backward.default,backward,9,8,8,3,5099,1053,2 -5987,_scaled_dot_product_flash_attention_backward_19,call_function,_scaled_dot_product_flash_attention_backward.default,backward,8,8,8,3,5206,943,2 
-6129,_scaled_dot_product_flash_attention_backward_20,call_function,_scaled_dot_product_flash_attention_backward.default,backward,7,8,8,3,5313,833,2 -6271,_scaled_dot_product_flash_attention_backward_21,call_function,_scaled_dot_product_flash_attention_backward.default,backward,6,8,8,3,5420,723,2 -6413,_scaled_dot_product_flash_attention_backward_22,call_function,_scaled_dot_product_flash_attention_backward.default,backward,5,8,8,3,5527,613,2 -6555,_scaled_dot_product_flash_attention_backward_23,call_function,_scaled_dot_product_flash_attention_backward.default,backward,4,8,8,3,5634,503,2 -6697,_scaled_dot_product_flash_attention_backward_24,call_function,_scaled_dot_product_flash_attention_backward.default,backward,3,8,8,3,5741,393,2 -6839,_scaled_dot_product_flash_attention_backward_25,call_function,_scaled_dot_product_flash_attention_backward.default,backward,2,8,8,3,5848,283,2 -6981,_scaled_dot_product_flash_attention_backward_26,call_function,_scaled_dot_product_flash_attention_backward.default,backward,1,8,8,3,5955,173,2 -7123,_scaled_dot_product_flash_attention_backward_27,call_function,_scaled_dot_product_flash_attention_backward.default,backward,0,8,8,3,6062,63,2 -313,_scaled_dot_product_flash_attention,call_function,_scaled_dot_product_flash_attention.default,forward,0,3,3,4,63,5761,2 -417,_scaled_dot_product_flash_attention_1,call_function,_scaled_dot_product_flash_attention.default,forward,1,3,3,4,173,5665,2 -521,_scaled_dot_product_flash_attention_2,call_function,_scaled_dot_product_flash_attention.default,forward,2,3,3,4,283,5569,2 -625,_scaled_dot_product_flash_attention_3,call_function,_scaled_dot_product_flash_attention.default,forward,3,3,3,4,393,5473,2 -729,_scaled_dot_product_flash_attention_4,call_function,_scaled_dot_product_flash_attention.default,forward,4,3,3,4,503,5377,2 -833,_scaled_dot_product_flash_attention_5,call_function,_scaled_dot_product_flash_attention.default,forward,5,3,3,4,613,5281,2 
-937,_scaled_dot_product_flash_attention_6,call_function,_scaled_dot_product_flash_attention.default,forward,6,3,3,4,723,5185,2 -1041,_scaled_dot_product_flash_attention_7,call_function,_scaled_dot_product_flash_attention.default,forward,7,3,3,4,833,5089,2 -1145,_scaled_dot_product_flash_attention_8,call_function,_scaled_dot_product_flash_attention.default,forward,8,3,3,4,943,4993,2 -1249,_scaled_dot_product_flash_attention_9,call_function,_scaled_dot_product_flash_attention.default,forward,9,3,3,4,1053,4897,2 -1353,_scaled_dot_product_flash_attention_10,call_function,_scaled_dot_product_flash_attention.default,forward,10,3,3,4,1163,4801,2 -1457,_scaled_dot_product_flash_attention_11,call_function,_scaled_dot_product_flash_attention.default,forward,11,3,3,4,1273,4705,2 -1561,_scaled_dot_product_flash_attention_12,call_function,_scaled_dot_product_flash_attention.default,forward,12,3,3,4,1383,4609,2 -1665,_scaled_dot_product_flash_attention_13,call_function,_scaled_dot_product_flash_attention.default,forward,13,3,3,4,1493,4513,2 -1769,_scaled_dot_product_flash_attention_14,call_function,_scaled_dot_product_flash_attention.default,forward,14,3,3,4,1603,4417,2 -1873,_scaled_dot_product_flash_attention_15,call_function,_scaled_dot_product_flash_attention.default,forward,15,3,3,4,1713,4321,2 -1977,_scaled_dot_product_flash_attention_16,call_function,_scaled_dot_product_flash_attention.default,forward,16,3,3,4,1823,4225,2 -2081,_scaled_dot_product_flash_attention_17,call_function,_scaled_dot_product_flash_attention.default,forward,17,3,3,4,1933,4129,2 -2185,_scaled_dot_product_flash_attention_18,call_function,_scaled_dot_product_flash_attention.default,forward,18,3,3,4,2043,4033,2 -2289,_scaled_dot_product_flash_attention_19,call_function,_scaled_dot_product_flash_attention.default,forward,19,3,3,4,2153,3937,2 -2393,_scaled_dot_product_flash_attention_20,call_function,_scaled_dot_product_flash_attention.default,forward,20,3,3,4,2263,3841,2 
-2497,_scaled_dot_product_flash_attention_21,call_function,_scaled_dot_product_flash_attention.default,forward,21,3,3,4,2373,3745,2 -2601,_scaled_dot_product_flash_attention_22,call_function,_scaled_dot_product_flash_attention.default,forward,22,3,3,4,2483,3649,2 -2705,_scaled_dot_product_flash_attention_23,call_function,_scaled_dot_product_flash_attention.default,forward,23,3,3,4,2593,3553,2 -2809,_scaled_dot_product_flash_attention_24,call_function,_scaled_dot_product_flash_attention.default,forward,24,3,3,4,2703,3457,2 -2913,_scaled_dot_product_flash_attention_25,call_function,_scaled_dot_product_flash_attention.default,forward,25,3,3,4,2813,3361,2 -3017,_scaled_dot_product_flash_attention_26,call_function,_scaled_dot_product_flash_attention.default,forward,26,3,3,4,2923,3265,2 -3121,_scaled_dot_product_flash_attention_27,call_function,_scaled_dot_product_flash_attention.default,forward,27,3,3,4,3033,3169,2 -260,embedding,call_function,embedding.default,forward,,2,2,1,5,5804,5 -270,mul,call_function,mul.Tensor,forward,0,2,2,1,14,5791,8 -272,mul_1,call_function,mul.Tensor,forward,0,2,2,1,18,5790,8 -278,einsum_default,call_function,einsum.default,forward,0,2,2,1,25,5772,5 -282,einsum_default_1,call_function,einsum.default,forward,0,2,2,1,25,5772,5 -299,mul_2,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8 -302,mul_3,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8 -286,einsum_default_2,call_function,einsum.default,forward,0,2,2,1,25,5765,5 -325,einsum_default_3,call_function,einsum.default,forward,0,2,2,1,73,5752,5 -326,add_1,call_function,add.Tensor,forward,0,2,2,1,74,5751,10 -336,mul_4,call_function,mul.Tensor,forward,0,2,2,1,83,5738,8 -338,mul_5,call_function,mul.Tensor,forward,0,2,2,1,87,5737,8 -344,einsum_default_4,call_function,einsum.default,forward,0,2,2,1,94,5732,5 -351,div,call_function,div.Tensor,forward,0,2,2,1,101,5714,6 -356,einsum_default_5,call_function,einsum.default,forward,0,2,2,1,94,5713,5 
-359,mul_6,call_function,mul.Tensor,forward,0,2,2,1,110,5711,8 -364,einsum_default_6,call_function,einsum.default,forward,0,2,2,1,116,5709,5 -365,add_4,call_function,add.Tensor,forward,0,2,2,1,117,5708,10 -375,mul_7,call_function,mul.Tensor,forward,1,2,2,1,126,5695,8 -377,mul_8,call_function,mul.Tensor,forward,1,2,2,1,130,5694,8 -383,einsum_default_7,call_function,einsum.default,forward,1,2,2,1,137,5676,5 -387,einsum_default_8,call_function,einsum.default,forward,1,2,2,1,137,5676,5 -403,mul_9,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8 -406,mul_10,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8 -391,einsum_default_9,call_function,einsum.default,forward,1,2,2,1,137,5669,5 -429,einsum_default_10,call_function,einsum.default,forward,1,2,2,1,183,5656,5 -430,add_6,call_function,add.Tensor,forward,1,2,2,1,184,5655,10 -440,mul_11,call_function,mul.Tensor,forward,1,2,2,1,193,5642,8 -442,mul_12,call_function,mul.Tensor,forward,1,2,2,1,197,5641,8 -448,einsum_default_11,call_function,einsum.default,forward,1,2,2,1,204,5636,5 -455,div_1,call_function,div.Tensor,forward,1,2,2,1,211,5618,6 -460,einsum_default_12,call_function,einsum.default,forward,1,2,2,1,204,5617,5 -463,mul_13,call_function,mul.Tensor,forward,1,2,2,1,220,5615,8 -468,einsum_default_13,call_function,einsum.default,forward,1,2,2,1,226,5613,5 -469,add_9,call_function,add.Tensor,forward,1,2,2,1,227,5612,10 -479,mul_14,call_function,mul.Tensor,forward,2,2,2,1,236,5599,8 -481,mul_15,call_function,mul.Tensor,forward,2,2,2,1,240,5598,8 -487,einsum_default_14,call_function,einsum.default,forward,2,2,2,1,247,5580,5 -491,einsum_default_15,call_function,einsum.default,forward,2,2,2,1,247,5580,5 -507,mul_16,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8 -510,mul_17,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8 -495,einsum_default_16,call_function,einsum.default,forward,2,2,2,1,247,5573,5 -533,einsum_default_17,call_function,einsum.default,forward,2,2,2,1,293,5560,5 
-534,add_11,call_function,add.Tensor,forward,2,2,2,1,294,5559,10 -544,mul_18,call_function,mul.Tensor,forward,2,2,2,1,303,5546,8 -546,mul_19,call_function,mul.Tensor,forward,2,2,2,1,307,5545,8 -552,einsum_default_18,call_function,einsum.default,forward,2,2,2,1,314,5540,5 -559,div_2,call_function,div.Tensor,forward,2,2,2,1,321,5522,6 -564,einsum_default_19,call_function,einsum.default,forward,2,2,2,1,314,5521,5 -567,mul_20,call_function,mul.Tensor,forward,2,2,2,1,330,5519,8 -572,einsum_default_20,call_function,einsum.default,forward,2,2,2,1,336,5517,5 -573,add_14,call_function,add.Tensor,forward,2,2,2,1,337,5516,10 -583,mul_21,call_function,mul.Tensor,forward,3,2,2,1,346,5503,8 -585,mul_22,call_function,mul.Tensor,forward,3,2,2,1,350,5502,8 -591,einsum_default_21,call_function,einsum.default,forward,3,2,2,1,357,5484,5 -595,einsum_default_22,call_function,einsum.default,forward,3,2,2,1,357,5484,5 -611,mul_23,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8 -614,mul_24,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8 -599,einsum_default_23,call_function,einsum.default,forward,3,2,2,1,357,5477,5 -637,einsum_default_24,call_function,einsum.default,forward,3,2,2,1,403,5464,5 -638,add_16,call_function,add.Tensor,forward,3,2,2,1,404,5463,10 -648,mul_25,call_function,mul.Tensor,forward,3,2,2,1,413,5450,8 -650,mul_26,call_function,mul.Tensor,forward,3,2,2,1,417,5449,8 -656,einsum_default_25,call_function,einsum.default,forward,3,2,2,1,424,5444,5 -663,div_3,call_function,div.Tensor,forward,3,2,2,1,431,5426,6 -668,einsum_default_26,call_function,einsum.default,forward,3,2,2,1,424,5425,5 -671,mul_27,call_function,mul.Tensor,forward,3,2,2,1,440,5423,8 -676,einsum_default_27,call_function,einsum.default,forward,3,2,2,1,446,5421,5 -677,add_19,call_function,add.Tensor,forward,3,2,2,1,447,5420,10 -687,mul_28,call_function,mul.Tensor,forward,4,2,2,1,456,5407,8 -689,mul_29,call_function,mul.Tensor,forward,4,2,2,1,460,5406,8 
-695,einsum_default_28,call_function,einsum.default,forward,4,2,2,1,467,5388,5 -699,einsum_default_29,call_function,einsum.default,forward,4,2,2,1,467,5388,5 -715,mul_30,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8 -718,mul_31,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8 -703,einsum_default_30,call_function,einsum.default,forward,4,2,2,1,467,5381,5 -741,einsum_default_31,call_function,einsum.default,forward,4,2,2,1,513,5368,5 -742,add_21,call_function,add.Tensor,forward,4,2,2,1,514,5367,10 -752,mul_32,call_function,mul.Tensor,forward,4,2,2,1,523,5354,8 -754,mul_33,call_function,mul.Tensor,forward,4,2,2,1,527,5353,8 -760,einsum_default_32,call_function,einsum.default,forward,4,2,2,1,534,5348,5 -767,div_4,call_function,div.Tensor,forward,4,2,2,1,541,5330,6 -772,einsum_default_33,call_function,einsum.default,forward,4,2,2,1,534,5329,5 -775,mul_34,call_function,mul.Tensor,forward,4,2,2,1,550,5327,8 -780,einsum_default_34,call_function,einsum.default,forward,4,2,2,1,556,5325,5 -781,add_24,call_function,add.Tensor,forward,4,2,2,1,557,5324,10 -791,mul_35,call_function,mul.Tensor,forward,5,2,2,1,566,5311,8 -793,mul_36,call_function,mul.Tensor,forward,5,2,2,1,570,5310,8 -799,einsum_default_35,call_function,einsum.default,forward,5,2,2,1,577,5292,5 -803,einsum_default_36,call_function,einsum.default,forward,5,2,2,1,577,5292,5 -819,mul_37,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8 -822,mul_38,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8 -807,einsum_default_37,call_function,einsum.default,forward,5,2,2,1,577,5285,5 -845,einsum_default_38,call_function,einsum.default,forward,5,2,2,1,623,5272,5 -846,add_26,call_function,add.Tensor,forward,5,2,2,1,624,5271,10 -856,mul_39,call_function,mul.Tensor,forward,5,2,2,1,633,5258,8 -858,mul_40,call_function,mul.Tensor,forward,5,2,2,1,637,5257,8 -864,einsum_default_39,call_function,einsum.default,forward,5,2,2,1,644,5252,5 -871,div_5,call_function,div.Tensor,forward,5,2,2,1,651,5234,6 
-876,einsum_default_40,call_function,einsum.default,forward,5,2,2,1,644,5233,5 -879,mul_41,call_function,mul.Tensor,forward,5,2,2,1,660,5231,8 -884,einsum_default_41,call_function,einsum.default,forward,5,2,2,1,666,5229,5 -885,add_29,call_function,add.Tensor,forward,5,2,2,1,667,5228,10 -895,mul_42,call_function,mul.Tensor,forward,6,2,2,1,676,5215,8 -897,mul_43,call_function,mul.Tensor,forward,6,2,2,1,680,5214,8 -903,einsum_default_42,call_function,einsum.default,forward,6,2,2,1,687,5196,5 -907,einsum_default_43,call_function,einsum.default,forward,6,2,2,1,687,5196,5 -923,mul_44,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8 -926,mul_45,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8 -911,einsum_default_44,call_function,einsum.default,forward,6,2,2,1,687,5189,5 -949,einsum_default_45,call_function,einsum.default,forward,6,2,2,1,733,5176,5 -950,add_31,call_function,add.Tensor,forward,6,2,2,1,734,5175,10 -960,mul_46,call_function,mul.Tensor,forward,6,2,2,1,743,5162,8 -962,mul_47,call_function,mul.Tensor,forward,6,2,2,1,747,5161,8 -968,einsum_default_46,call_function,einsum.default,forward,6,2,2,1,754,5156,5 -975,div_6,call_function,div.Tensor,forward,6,2,2,1,761,5138,6 -980,einsum_default_47,call_function,einsum.default,forward,6,2,2,1,754,5137,5 -983,mul_48,call_function,mul.Tensor,forward,6,2,2,1,770,5135,8 -988,einsum_default_48,call_function,einsum.default,forward,6,2,2,1,776,5133,5 -989,add_34,call_function,add.Tensor,forward,6,2,2,1,777,5132,10 -999,mul_49,call_function,mul.Tensor,forward,7,2,2,1,786,5119,8 -1001,mul_50,call_function,mul.Tensor,forward,7,2,2,1,790,5118,8 -1007,einsum_default_49,call_function,einsum.default,forward,7,2,2,1,797,5100,5 -1011,einsum_default_50,call_function,einsum.default,forward,7,2,2,1,797,5100,5 -1027,mul_51,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8 -1030,mul_52,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8 -1015,einsum_default_51,call_function,einsum.default,forward,7,2,2,1,797,5093,5 
-1053,einsum_default_52,call_function,einsum.default,forward,7,2,2,1,843,5080,5 -1054,add_36,call_function,add.Tensor,forward,7,2,2,1,844,5079,10 -1064,mul_53,call_function,mul.Tensor,forward,7,2,2,1,853,5066,8 -1066,mul_54,call_function,mul.Tensor,forward,7,2,2,1,857,5065,8 -1072,einsum_default_53,call_function,einsum.default,forward,7,2,2,1,864,5060,5 -1079,div_7,call_function,div.Tensor,forward,7,2,2,1,871,5042,6 -1084,einsum_default_54,call_function,einsum.default,forward,7,2,2,1,864,5041,5 -1087,mul_55,call_function,mul.Tensor,forward,7,2,2,1,880,5039,8 -1092,einsum_default_55,call_function,einsum.default,forward,7,2,2,1,886,5037,5 -1093,add_39,call_function,add.Tensor,forward,7,2,2,1,887,5036,10 -1103,mul_56,call_function,mul.Tensor,forward,8,2,2,1,896,5023,8 -1105,mul_57,call_function,mul.Tensor,forward,8,2,2,1,900,5022,8 -1111,einsum_default_56,call_function,einsum.default,forward,8,2,2,1,907,5004,5 -1115,einsum_default_57,call_function,einsum.default,forward,8,2,2,1,907,5004,5 -1131,mul_58,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8 -1134,mul_59,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8 -1119,einsum_default_58,call_function,einsum.default,forward,8,2,2,1,907,4997,5 -1157,einsum_default_59,call_function,einsum.default,forward,8,2,2,1,953,4984,5 -1158,add_41,call_function,add.Tensor,forward,8,2,2,1,954,4983,10 -1168,mul_60,call_function,mul.Tensor,forward,8,2,2,1,963,4970,8 -1170,mul_61,call_function,mul.Tensor,forward,8,2,2,1,967,4969,8 -1176,einsum_default_60,call_function,einsum.default,forward,8,2,2,1,974,4964,5 -1183,div_8,call_function,div.Tensor,forward,8,2,2,1,981,4946,6 -1188,einsum_default_61,call_function,einsum.default,forward,8,2,2,1,974,4945,5 -1191,mul_62,call_function,mul.Tensor,forward,8,2,2,1,990,4943,8 -1196,einsum_default_62,call_function,einsum.default,forward,8,2,2,1,996,4941,5 -1197,add_44,call_function,add.Tensor,forward,8,2,2,1,997,4940,10 -1207,mul_63,call_function,mul.Tensor,forward,9,2,2,1,1006,4927,8 
-1209,mul_64,call_function,mul.Tensor,forward,9,2,2,1,1010,4926,8 -1215,einsum_default_63,call_function,einsum.default,forward,9,2,2,1,1017,4908,5 -1219,einsum_default_64,call_function,einsum.default,forward,9,2,2,1,1017,4908,5 -1235,mul_65,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8 -1238,mul_66,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8 -1223,einsum_default_65,call_function,einsum.default,forward,9,2,2,1,1017,4901,5 -1261,einsum_default_66,call_function,einsum.default,forward,9,2,2,1,1063,4888,5 -1262,add_46,call_function,add.Tensor,forward,9,2,2,1,1064,4887,10 -1272,mul_67,call_function,mul.Tensor,forward,9,2,2,1,1073,4874,8 -1274,mul_68,call_function,mul.Tensor,forward,9,2,2,1,1077,4873,8 -1280,einsum_default_67,call_function,einsum.default,forward,9,2,2,1,1084,4868,5 -1287,div_9,call_function,div.Tensor,forward,9,2,2,1,1091,4850,6 -1292,einsum_default_68,call_function,einsum.default,forward,9,2,2,1,1084,4849,5 -1295,mul_69,call_function,mul.Tensor,forward,9,2,2,1,1100,4847,8 -1300,einsum_default_69,call_function,einsum.default,forward,9,2,2,1,1106,4845,5 -1301,add_49,call_function,add.Tensor,forward,9,2,2,1,1107,4844,10 -1311,mul_70,call_function,mul.Tensor,forward,10,2,2,1,1116,4831,8 -1313,mul_71,call_function,mul.Tensor,forward,10,2,2,1,1120,4830,8 -1319,einsum_default_70,call_function,einsum.default,forward,10,2,2,1,1127,4812,5 -1323,einsum_default_71,call_function,einsum.default,forward,10,2,2,1,1127,4812,5 -1339,mul_72,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8 -1342,mul_73,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8 -1327,einsum_default_72,call_function,einsum.default,forward,10,2,2,1,1127,4805,5 -1365,einsum_default_73,call_function,einsum.default,forward,10,2,2,1,1173,4792,5 -1366,add_51,call_function,add.Tensor,forward,10,2,2,1,1174,4791,10 -1376,mul_74,call_function,mul.Tensor,forward,10,2,2,1,1183,4778,8 -1378,mul_75,call_function,mul.Tensor,forward,10,2,2,1,1187,4777,8 
-1384,einsum_default_74,call_function,einsum.default,forward,10,2,2,1,1194,4772,5 -1391,div_10,call_function,div.Tensor,forward,10,2,2,1,1201,4754,6 -1396,einsum_default_75,call_function,einsum.default,forward,10,2,2,1,1194,4753,5 -1399,mul_76,call_function,mul.Tensor,forward,10,2,2,1,1210,4751,8 -1404,einsum_default_76,call_function,einsum.default,forward,10,2,2,1,1216,4749,5 -1405,add_54,call_function,add.Tensor,forward,10,2,2,1,1217,4748,10 -1415,mul_77,call_function,mul.Tensor,forward,11,2,2,1,1226,4735,8 -1417,mul_78,call_function,mul.Tensor,forward,11,2,2,1,1230,4734,8 -1423,einsum_default_77,call_function,einsum.default,forward,11,2,2,1,1237,4716,5 -1427,einsum_default_78,call_function,einsum.default,forward,11,2,2,1,1237,4716,5 -1443,mul_79,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8 -1446,mul_80,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8 -1431,einsum_default_79,call_function,einsum.default,forward,11,2,2,1,1237,4709,5 -1469,einsum_default_80,call_function,einsum.default,forward,11,2,2,1,1283,4696,5 -1470,add_56,call_function,add.Tensor,forward,11,2,2,1,1284,4695,10 -1480,mul_81,call_function,mul.Tensor,forward,11,2,2,1,1293,4682,8 -1482,mul_82,call_function,mul.Tensor,forward,11,2,2,1,1297,4681,8 -1488,einsum_default_81,call_function,einsum.default,forward,11,2,2,1,1304,4676,5 -1495,div_11,call_function,div.Tensor,forward,11,2,2,1,1311,4658,6 -1500,einsum_default_82,call_function,einsum.default,forward,11,2,2,1,1304,4657,5 -1503,mul_83,call_function,mul.Tensor,forward,11,2,2,1,1320,4655,8 -1508,einsum_default_83,call_function,einsum.default,forward,11,2,2,1,1326,4653,5 -1509,add_59,call_function,add.Tensor,forward,11,2,2,1,1327,4652,10 -1519,mul_84,call_function,mul.Tensor,forward,12,2,2,1,1336,4639,8 -1521,mul_85,call_function,mul.Tensor,forward,12,2,2,1,1340,4638,8 -1527,einsum_default_84,call_function,einsum.default,forward,12,2,2,1,1347,4620,5 -1531,einsum_default_85,call_function,einsum.default,forward,12,2,2,1,1347,4620,5 
-1547,mul_86,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8 -1550,mul_87,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8 -1535,einsum_default_86,call_function,einsum.default,forward,12,2,2,1,1347,4613,5 -1573,einsum_default_87,call_function,einsum.default,forward,12,2,2,1,1393,4600,5 -1574,add_61,call_function,add.Tensor,forward,12,2,2,1,1394,4599,10 -1584,mul_88,call_function,mul.Tensor,forward,12,2,2,1,1403,4586,8 -1586,mul_89,call_function,mul.Tensor,forward,12,2,2,1,1407,4585,8 -1592,einsum_default_88,call_function,einsum.default,forward,12,2,2,1,1414,4580,5 -1599,div_12,call_function,div.Tensor,forward,12,2,2,1,1421,4562,6 -1604,einsum_default_89,call_function,einsum.default,forward,12,2,2,1,1414,4561,5 -1607,mul_90,call_function,mul.Tensor,forward,12,2,2,1,1430,4559,8 -1612,einsum_default_90,call_function,einsum.default,forward,12,2,2,1,1436,4557,5 -1613,add_64,call_function,add.Tensor,forward,12,2,2,1,1437,4556,10 -1623,mul_91,call_function,mul.Tensor,forward,13,2,2,1,1446,4543,8 -1625,mul_92,call_function,mul.Tensor,forward,13,2,2,1,1450,4542,8 -1631,einsum_default_91,call_function,einsum.default,forward,13,2,2,1,1457,4524,5 -1635,einsum_default_92,call_function,einsum.default,forward,13,2,2,1,1457,4524,5 -1651,mul_93,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8 -1654,mul_94,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8 -1639,einsum_default_93,call_function,einsum.default,forward,13,2,2,1,1457,4517,5 -1677,einsum_default_94,call_function,einsum.default,forward,13,2,2,1,1503,4504,5 -1678,add_66,call_function,add.Tensor,forward,13,2,2,1,1504,4503,10 -1688,mul_95,call_function,mul.Tensor,forward,13,2,2,1,1513,4490,8 -1690,mul_96,call_function,mul.Tensor,forward,13,2,2,1,1517,4489,8 -1696,einsum_default_95,call_function,einsum.default,forward,13,2,2,1,1524,4484,5 -1703,div_13,call_function,div.Tensor,forward,13,2,2,1,1531,4466,6 -1708,einsum_default_96,call_function,einsum.default,forward,13,2,2,1,1524,4465,5 
-1711,mul_97,call_function,mul.Tensor,forward,13,2,2,1,1540,4463,8 -1716,einsum_default_97,call_function,einsum.default,forward,13,2,2,1,1546,4461,5 -1717,add_69,call_function,add.Tensor,forward,13,2,2,1,1547,4460,10 -1727,mul_98,call_function,mul.Tensor,forward,14,2,2,1,1556,4447,8 -1729,mul_99,call_function,mul.Tensor,forward,14,2,2,1,1560,4446,8 -1735,einsum_default_98,call_function,einsum.default,forward,14,2,2,1,1567,4428,5 -1739,einsum_default_99,call_function,einsum.default,forward,14,2,2,1,1567,4428,5 -1755,mul_100,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8 -1758,mul_101,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8 -1743,einsum_default_100,call_function,einsum.default,forward,14,2,2,1,1567,4421,5 -1781,einsum_default_101,call_function,einsum.default,forward,14,2,2,1,1613,4408,5 -1782,add_71,call_function,add.Tensor,forward,14,2,2,1,1614,4407,10 -1792,mul_102,call_function,mul.Tensor,forward,14,2,2,1,1623,4394,8 -1794,mul_103,call_function,mul.Tensor,forward,14,2,2,1,1627,4393,8 -1800,einsum_default_102,call_function,einsum.default,forward,14,2,2,1,1634,4388,5 -1807,div_14,call_function,div.Tensor,forward,14,2,2,1,1641,4370,6 -1812,einsum_default_103,call_function,einsum.default,forward,14,2,2,1,1634,4369,5 -1815,mul_104,call_function,mul.Tensor,forward,14,2,2,1,1650,4367,8 -1820,einsum_default_104,call_function,einsum.default,forward,14,2,2,1,1656,4365,5 -1821,add_74,call_function,add.Tensor,forward,14,2,2,1,1657,4364,10 -1831,mul_105,call_function,mul.Tensor,forward,15,2,2,1,1666,4351,8 -1833,mul_106,call_function,mul.Tensor,forward,15,2,2,1,1670,4350,8 -1839,einsum_default_105,call_function,einsum.default,forward,15,2,2,1,1677,4332,5 -1843,einsum_default_106,call_function,einsum.default,forward,15,2,2,1,1677,4332,5 -1859,mul_107,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8 -1862,mul_108,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8 -1847,einsum_default_107,call_function,einsum.default,forward,15,2,2,1,1677,4325,5 
-1885,einsum_default_108,call_function,einsum.default,forward,15,2,2,1,1723,4312,5 -1886,add_76,call_function,add.Tensor,forward,15,2,2,1,1724,4311,10 -1896,mul_109,call_function,mul.Tensor,forward,15,2,2,1,1733,4298,8 -1898,mul_110,call_function,mul.Tensor,forward,15,2,2,1,1737,4297,8 -1904,einsum_default_109,call_function,einsum.default,forward,15,2,2,1,1744,4292,5 -1911,div_15,call_function,div.Tensor,forward,15,2,2,1,1751,4274,6 -1916,einsum_default_110,call_function,einsum.default,forward,15,2,2,1,1744,4273,5 -1919,mul_111,call_function,mul.Tensor,forward,15,2,2,1,1760,4271,8 -1924,einsum_default_111,call_function,einsum.default,forward,15,2,2,1,1766,4269,5 -1925,add_79,call_function,add.Tensor,forward,15,2,2,1,1767,4268,10 -1935,mul_112,call_function,mul.Tensor,forward,16,2,2,1,1776,4255,8 -1937,mul_113,call_function,mul.Tensor,forward,16,2,2,1,1780,4254,8 -1943,einsum_default_112,call_function,einsum.default,forward,16,2,2,1,1787,4236,5 -1947,einsum_default_113,call_function,einsum.default,forward,16,2,2,1,1787,4236,5 -1963,mul_114,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8 -1966,mul_115,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8 -1951,einsum_default_114,call_function,einsum.default,forward,16,2,2,1,1787,4229,5 -1989,einsum_default_115,call_function,einsum.default,forward,16,2,2,1,1833,4216,5 -1990,add_81,call_function,add.Tensor,forward,16,2,2,1,1834,4215,10 -2000,mul_116,call_function,mul.Tensor,forward,16,2,2,1,1843,4202,8 -2002,mul_117,call_function,mul.Tensor,forward,16,2,2,1,1847,4201,8 -2008,einsum_default_116,call_function,einsum.default,forward,16,2,2,1,1854,4196,5 -2015,div_16,call_function,div.Tensor,forward,16,2,2,1,1861,4178,6 -2020,einsum_default_117,call_function,einsum.default,forward,16,2,2,1,1854,4177,5 -2023,mul_118,call_function,mul.Tensor,forward,16,2,2,1,1870,4175,8 -2028,einsum_default_118,call_function,einsum.default,forward,16,2,2,1,1876,4173,5 -2029,add_84,call_function,add.Tensor,forward,16,2,2,1,1877,4172,10 
-2039,mul_119,call_function,mul.Tensor,forward,17,2,2,1,1886,4159,8 -2041,mul_120,call_function,mul.Tensor,forward,17,2,2,1,1890,4158,8 -2047,einsum_default_119,call_function,einsum.default,forward,17,2,2,1,1897,4140,5 -2051,einsum_default_120,call_function,einsum.default,forward,17,2,2,1,1897,4140,5 -2067,mul_121,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8 -2070,mul_122,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8 -2055,einsum_default_121,call_function,einsum.default,forward,17,2,2,1,1897,4133,5 -2093,einsum_default_122,call_function,einsum.default,forward,17,2,2,1,1943,4120,5 -2094,add_86,call_function,add.Tensor,forward,17,2,2,1,1944,4119,10 -2104,mul_123,call_function,mul.Tensor,forward,17,2,2,1,1953,4106,8 -2106,mul_124,call_function,mul.Tensor,forward,17,2,2,1,1957,4105,8 -2112,einsum_default_123,call_function,einsum.default,forward,17,2,2,1,1964,4100,5 -2119,div_17,call_function,div.Tensor,forward,17,2,2,1,1971,4082,6 -2124,einsum_default_124,call_function,einsum.default,forward,17,2,2,1,1964,4081,5 -2127,mul_125,call_function,mul.Tensor,forward,17,2,2,1,1980,4079,8 -2132,einsum_default_125,call_function,einsum.default,forward,17,2,2,1,1986,4077,5 -2133,add_89,call_function,add.Tensor,forward,17,2,2,1,1987,4076,10 -2143,mul_126,call_function,mul.Tensor,forward,18,2,2,1,1996,4063,8 -2145,mul_127,call_function,mul.Tensor,forward,18,2,2,1,2000,4062,8 -2151,einsum_default_126,call_function,einsum.default,forward,18,2,2,1,2007,4044,5 -2155,einsum_default_127,call_function,einsum.default,forward,18,2,2,1,2007,4044,5 -2171,mul_128,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8 -2174,mul_129,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8 -2159,einsum_default_128,call_function,einsum.default,forward,18,2,2,1,2007,4037,5 -2197,einsum_default_129,call_function,einsum.default,forward,18,2,2,1,2053,4024,5 -2198,add_91,call_function,add.Tensor,forward,18,2,2,1,2054,4023,10 -2208,mul_130,call_function,mul.Tensor,forward,18,2,2,1,2063,4010,8 
-2210,mul_131,call_function,mul.Tensor,forward,18,2,2,1,2067,4009,8 -2216,einsum_default_130,call_function,einsum.default,forward,18,2,2,1,2074,4004,5 -2223,div_18,call_function,div.Tensor,forward,18,2,2,1,2081,3986,6 -2228,einsum_default_131,call_function,einsum.default,forward,18,2,2,1,2074,3985,5 -2231,mul_132,call_function,mul.Tensor,forward,18,2,2,1,2090,3983,8 -2236,einsum_default_132,call_function,einsum.default,forward,18,2,2,1,2096,3981,5 -2237,add_94,call_function,add.Tensor,forward,18,2,2,1,2097,3980,10 -2247,mul_133,call_function,mul.Tensor,forward,19,2,2,1,2106,3967,8 -2249,mul_134,call_function,mul.Tensor,forward,19,2,2,1,2110,3966,8 -2255,einsum_default_133,call_function,einsum.default,forward,19,2,2,1,2117,3948,5 -2259,einsum_default_134,call_function,einsum.default,forward,19,2,2,1,2117,3948,5 -2275,mul_135,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8 -2278,mul_136,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8 -2263,einsum_default_135,call_function,einsum.default,forward,19,2,2,1,2117,3941,5 -2301,einsum_default_136,call_function,einsum.default,forward,19,2,2,1,2163,3928,5 -2302,add_96,call_function,add.Tensor,forward,19,2,2,1,2164,3927,10 -2312,mul_137,call_function,mul.Tensor,forward,19,2,2,1,2173,3914,8 -2314,mul_138,call_function,mul.Tensor,forward,19,2,2,1,2177,3913,8 -2320,einsum_default_137,call_function,einsum.default,forward,19,2,2,1,2184,3908,5 -2327,div_19,call_function,div.Tensor,forward,19,2,2,1,2191,3890,6 -2332,einsum_default_138,call_function,einsum.default,forward,19,2,2,1,2184,3889,5 -2335,mul_139,call_function,mul.Tensor,forward,19,2,2,1,2200,3887,8 -2340,einsum_default_139,call_function,einsum.default,forward,19,2,2,1,2206,3885,5 -2341,add_99,call_function,add.Tensor,forward,19,2,2,1,2207,3884,10 -2351,mul_140,call_function,mul.Tensor,forward,20,2,2,1,2216,3871,8 -2353,mul_141,call_function,mul.Tensor,forward,20,2,2,1,2220,3870,8 -2359,einsum_default_140,call_function,einsum.default,forward,20,2,2,1,2227,3852,5 
-2363,einsum_default_141,call_function,einsum.default,forward,20,2,2,1,2227,3852,5 -2379,mul_142,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8 -2382,mul_143,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8 -2367,einsum_default_142,call_function,einsum.default,forward,20,2,2,1,2227,3845,5 -2405,einsum_default_143,call_function,einsum.default,forward,20,2,2,1,2273,3832,5 -2406,add_101,call_function,add.Tensor,forward,20,2,2,1,2274,3831,10 -2416,mul_144,call_function,mul.Tensor,forward,20,2,2,1,2283,3818,8 -2418,mul_145,call_function,mul.Tensor,forward,20,2,2,1,2287,3817,8 -2424,einsum_default_144,call_function,einsum.default,forward,20,2,2,1,2294,3812,5 -2431,div_20,call_function,div.Tensor,forward,20,2,2,1,2301,3794,6 -2436,einsum_default_145,call_function,einsum.default,forward,20,2,2,1,2294,3793,5 -2439,mul_146,call_function,mul.Tensor,forward,20,2,2,1,2310,3791,8 -2444,einsum_default_146,call_function,einsum.default,forward,20,2,2,1,2316,3789,5 -2445,add_104,call_function,add.Tensor,forward,20,2,2,1,2317,3788,10 -2455,mul_147,call_function,mul.Tensor,forward,21,2,2,1,2326,3775,8 -2457,mul_148,call_function,mul.Tensor,forward,21,2,2,1,2330,3774,8 -2463,einsum_default_147,call_function,einsum.default,forward,21,2,2,1,2337,3756,5 -2467,einsum_default_148,call_function,einsum.default,forward,21,2,2,1,2337,3756,5 -2483,mul_149,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8 -2486,mul_150,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8 -2471,einsum_default_149,call_function,einsum.default,forward,21,2,2,1,2337,3749,5 -2509,einsum_default_150,call_function,einsum.default,forward,21,2,2,1,2383,3736,5 -2510,add_106,call_function,add.Tensor,forward,21,2,2,1,2384,3735,10 -2520,mul_151,call_function,mul.Tensor,forward,21,2,2,1,2393,3722,8 -2522,mul_152,call_function,mul.Tensor,forward,21,2,2,1,2397,3721,8 -2528,einsum_default_151,call_function,einsum.default,forward,21,2,2,1,2404,3716,5 
-2535,div_21,call_function,div.Tensor,forward,21,2,2,1,2411,3698,6 -2540,einsum_default_152,call_function,einsum.default,forward,21,2,2,1,2404,3697,5 -2543,mul_153,call_function,mul.Tensor,forward,21,2,2,1,2420,3695,8 -2548,einsum_default_153,call_function,einsum.default,forward,21,2,2,1,2426,3693,5 -2549,add_109,call_function,add.Tensor,forward,21,2,2,1,2427,3692,10 -2559,mul_154,call_function,mul.Tensor,forward,22,2,2,1,2436,3679,8 -2561,mul_155,call_function,mul.Tensor,forward,22,2,2,1,2440,3678,8 -2567,einsum_default_154,call_function,einsum.default,forward,22,2,2,1,2447,3660,5 -2571,einsum_default_155,call_function,einsum.default,forward,22,2,2,1,2447,3660,5 -2587,mul_156,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8 -2590,mul_157,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8 -2575,einsum_default_156,call_function,einsum.default,forward,22,2,2,1,2447,3653,5 -2613,einsum_default_157,call_function,einsum.default,forward,22,2,2,1,2493,3640,5 -2614,add_111,call_function,add.Tensor,forward,22,2,2,1,2494,3639,10 -2624,mul_158,call_function,mul.Tensor,forward,22,2,2,1,2503,3626,8 -2626,mul_159,call_function,mul.Tensor,forward,22,2,2,1,2507,3625,8 -2632,einsum_default_158,call_function,einsum.default,forward,22,2,2,1,2514,3620,5 -2639,div_22,call_function,div.Tensor,forward,22,2,2,1,2521,3602,6 -2644,einsum_default_159,call_function,einsum.default,forward,22,2,2,1,2514,3601,5 -2647,mul_160,call_function,mul.Tensor,forward,22,2,2,1,2530,3599,8 -2652,einsum_default_160,call_function,einsum.default,forward,22,2,2,1,2536,3597,5 -2653,add_114,call_function,add.Tensor,forward,22,2,2,1,2537,3596,10 -2663,mul_161,call_function,mul.Tensor,forward,23,2,2,1,2546,3583,8 -2665,mul_162,call_function,mul.Tensor,forward,23,2,2,1,2550,3582,8 -2671,einsum_default_161,call_function,einsum.default,forward,23,2,2,1,2557,3564,5 -2675,einsum_default_162,call_function,einsum.default,forward,23,2,2,1,2557,3564,5 
-2691,mul_163,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8 -2694,mul_164,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8 -2679,einsum_default_163,call_function,einsum.default,forward,23,2,2,1,2557,3557,5 -2717,einsum_default_164,call_function,einsum.default,forward,23,2,2,1,2603,3544,5 -2718,add_116,call_function,add.Tensor,forward,23,2,2,1,2604,3543,10 -2728,mul_165,call_function,mul.Tensor,forward,23,2,2,1,2613,3530,8 -2730,mul_166,call_function,mul.Tensor,forward,23,2,2,1,2617,3529,8 -2736,einsum_default_165,call_function,einsum.default,forward,23,2,2,1,2624,3524,5 -2743,div_23,call_function,div.Tensor,forward,23,2,2,1,2631,3506,6 -2748,einsum_default_166,call_function,einsum.default,forward,23,2,2,1,2624,3505,5 -2751,mul_167,call_function,mul.Tensor,forward,23,2,2,1,2640,3503,8 -2756,einsum_default_167,call_function,einsum.default,forward,23,2,2,1,2646,3501,5 -2757,add_119,call_function,add.Tensor,forward,23,2,2,1,2647,3500,10 -2767,mul_168,call_function,mul.Tensor,forward,24,2,2,1,2656,3487,8 -2769,mul_169,call_function,mul.Tensor,forward,24,2,2,1,2660,3486,8 -2775,einsum_default_168,call_function,einsum.default,forward,24,2,2,1,2667,3468,5 -2779,einsum_default_169,call_function,einsum.default,forward,24,2,2,1,2667,3468,5 -2795,mul_170,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8 -2798,mul_171,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8 -2783,einsum_default_170,call_function,einsum.default,forward,24,2,2,1,2667,3461,5 -2821,einsum_default_171,call_function,einsum.default,forward,24,2,2,1,2713,3448,5 -2822,add_121,call_function,add.Tensor,forward,24,2,2,1,2714,3447,10 -2832,mul_172,call_function,mul.Tensor,forward,24,2,2,1,2723,3434,8 -2834,mul_173,call_function,mul.Tensor,forward,24,2,2,1,2727,3433,8 -2840,einsum_default_172,call_function,einsum.default,forward,24,2,2,1,2734,3428,5 -2847,div_24,call_function,div.Tensor,forward,24,2,2,1,2741,3410,6 
-2852,einsum_default_173,call_function,einsum.default,forward,24,2,2,1,2734,3409,5 -2855,mul_174,call_function,mul.Tensor,forward,24,2,2,1,2750,3407,8 -2860,einsum_default_174,call_function,einsum.default,forward,24,2,2,1,2756,3405,5 -2861,add_124,call_function,add.Tensor,forward,24,2,2,1,2757,3404,10 -2871,mul_175,call_function,mul.Tensor,forward,25,2,2,1,2766,3391,8 -2873,mul_176,call_function,mul.Tensor,forward,25,2,2,1,2770,3390,8 -2879,einsum_default_175,call_function,einsum.default,forward,25,2,2,1,2777,3372,5 -2883,einsum_default_176,call_function,einsum.default,forward,25,2,2,1,2777,3372,5 -2899,mul_177,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8 -2902,mul_178,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8 -2887,einsum_default_177,call_function,einsum.default,forward,25,2,2,1,2777,3365,5 -2925,einsum_default_178,call_function,einsum.default,forward,25,2,2,1,2823,3352,5 -2926,add_126,call_function,add.Tensor,forward,25,2,2,1,2824,3351,10 -2936,mul_179,call_function,mul.Tensor,forward,25,2,2,1,2833,3338,8 -2938,mul_180,call_function,mul.Tensor,forward,25,2,2,1,2837,3337,8 -2944,einsum_default_179,call_function,einsum.default,forward,25,2,2,1,2844,3332,5 -2951,div_25,call_function,div.Tensor,forward,25,2,2,1,2851,3314,6 -2956,einsum_default_180,call_function,einsum.default,forward,25,2,2,1,2844,3313,5 -2959,mul_181,call_function,mul.Tensor,forward,25,2,2,1,2860,3311,8 -2964,einsum_default_181,call_function,einsum.default,forward,25,2,2,1,2866,3309,5 -2965,add_129,call_function,add.Tensor,forward,25,2,2,1,2867,3308,10 -2975,mul_182,call_function,mul.Tensor,forward,26,2,2,1,2876,3295,8 -2977,mul_183,call_function,mul.Tensor,forward,26,2,2,1,2880,3294,8 -2983,einsum_default_182,call_function,einsum.default,forward,26,2,2,1,2887,3276,5 -2987,einsum_default_183,call_function,einsum.default,forward,26,2,2,1,2887,3276,5 -3003,mul_184,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8 
-3006,mul_185,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8 -2991,einsum_default_184,call_function,einsum.default,forward,26,2,2,1,2887,3269,5 -3029,einsum_default_185,call_function,einsum.default,forward,26,2,2,1,2933,3256,5 -3030,add_131,call_function,add.Tensor,forward,26,2,2,1,2934,3255,10 -3040,mul_186,call_function,mul.Tensor,forward,26,2,2,1,2943,3242,8 -3042,mul_187,call_function,mul.Tensor,forward,26,2,2,1,2947,3241,8 -3048,einsum_default_186,call_function,einsum.default,forward,26,2,2,1,2954,3236,5 -3055,div_26,call_function,div.Tensor,forward,26,2,2,1,2961,3218,6 -3060,einsum_default_187,call_function,einsum.default,forward,26,2,2,1,2954,3217,5 -3063,mul_188,call_function,mul.Tensor,forward,26,2,2,1,2970,3215,8 -3068,einsum_default_188,call_function,einsum.default,forward,26,2,2,1,2976,3213,5 -3069,add_134,call_function,add.Tensor,forward,26,2,2,1,2977,3212,10 -3079,mul_189,call_function,mul.Tensor,forward,27,2,2,1,2986,3199,8 -3081,mul_190,call_function,mul.Tensor,forward,27,2,2,1,2990,3198,8 -3087,einsum_default_189,call_function,einsum.default,forward,27,2,2,1,2997,3180,5 -3091,einsum_default_190,call_function,einsum.default,forward,27,2,2,1,2997,3180,5 -3107,mul_191,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8 -3110,mul_192,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8 -3095,einsum_default_191,call_function,einsum.default,forward,27,2,2,1,2997,3173,5 -3133,einsum_default_192,call_function,einsum.default,forward,27,2,2,1,3043,3160,5 -3134,add_136,call_function,add.Tensor,forward,27,2,2,1,3044,3159,10 -3144,mul_193,call_function,mul.Tensor,forward,27,2,2,1,3053,3146,8 -3146,mul_194,call_function,mul.Tensor,forward,27,2,2,1,3057,3145,8 -3152,einsum_default_193,call_function,einsum.default,forward,27,2,2,1,3064,3140,5 -3159,div_27,call_function,div.Tensor,forward,27,2,2,1,3071,3122,6 -3164,einsum_default_194,call_function,einsum.default,forward,27,2,2,1,3064,3121,5 
-3167,mul_195,call_function,mul.Tensor,forward,27,2,2,1,3080,3119,8 -3172,einsum_default_195,call_function,einsum.default,forward,27,2,2,1,3086,3117,5 -3173,add_139,call_function,add.Tensor,forward,27,2,2,1,3087,3116,10 -3196,einsum_default_198,call_function,einsum.default,backward,,2,2,1,8,3099,5 -3204,mul_199,call_function,mul.Tensor,backward,,2,2,1,3097,3097,8 -3203,mul_198,call_function,mul.Tensor,backward,,2,2,1,15,3091,8 -3207,mul_200,call_function,mul.Tensor,backward,,2,2,1,3114,3089,8 -3210,mul_201,call_function,mul.Tensor,backward,,2,2,1,3117,3087,8 -3211,sub,call_function,sub.Tensor,backward,,2,2,1,3118,3086,10 -3212,mul_202,call_function,mul.Tensor,backward,,2,2,1,3119,3085,8 -3222,einsum_default_200,call_function,einsum.default,backward,27,2,2,1,3123,3078,5 -3228,mul_205,call_function,mul.Tensor,backward,27,2,2,1,3125,3069,8 -3247,mul_208,call_function,mul.Tensor,backward,27,2,2,1,3075,3068,8 -3245,mul_207,call_function,mul.Tensor,backward,27,2,2,1,3135,3067,8 -3249,mul_209,call_function,mul.Tensor,backward,27,2,2,1,3139,3066,8 -3227,mul_204,call_function,mul.Tensor,backward,27,2,2,1,3125,3065,8 -3232,einsum_default_202,call_function,einsum.default,backward,27,2,2,1,3128,3059,5 -3254,einsum_default_204,call_function,einsum.default,backward,27,2,2,1,3143,3059,5 -3255,add_143,call_function,add.Tensor,unknown,,2,2,1,3148,3058,10 -3264,mul_211,call_function,mul.Tensor,backward,27,2,2,1,3054,3056,8 -3263,mul_210,call_function,mul.Tensor,backward,27,2,2,1,3152,3050,8 -3267,mul_212,call_function,mul.Tensor,backward,27,2,2,1,3157,3048,8 -3270,mul_213,call_function,mul.Tensor,backward,27,2,2,1,3160,3046,8 -3271,sub_2,call_function,sub.Tensor,backward,27,2,2,1,3161,3045,10 -3272,mul_214,call_function,mul.Tensor,backward,27,2,2,1,3162,3044,8 -3277,add_144,call_function,add.Tensor,unknown,,2,2,1,3164,3042,10 -3283,einsum_default_206,call_function,einsum.default,backward,27,2,2,1,3167,3036,5 -3302,mul_216,call_function,mul.Tensor,backward,27,2,2,1,3181,3002,8 
-3307,mul_217,call_function,mul.Tensor,backward,27,2,2,1,3181,3001,8 -3320,einsum_default_208,call_function,einsum.default,backward,27,2,2,1,3179,2992,5 -3327,einsum_default_210,call_function,einsum.default,backward,27,2,2,1,3188,2992,5 -3328,add_145,call_function,add.Tensor,unknown,,2,2,1,3195,2991,10 -3335,einsum_default_212,call_function,einsum.default,backward,27,2,2,1,3188,2991,5 -3336,add_146,call_function,add.Tensor,unknown,,2,2,1,3211,2990,10 -3345,mul_219,call_function,mul.Tensor,backward,27,2,2,1,2987,2988,8 -3344,mul_218,call_function,mul.Tensor,backward,27,2,2,1,3215,2982,8 -3348,mul_220,call_function,mul.Tensor,backward,27,2,2,1,3220,2980,8 -3351,mul_221,call_function,mul.Tensor,backward,27,2,2,1,3223,2978,8 -3352,sub_3,call_function,sub.Tensor,backward,27,2,2,1,3224,2977,10 -3353,mul_222,call_function,mul.Tensor,backward,27,2,2,1,3225,2976,8 -3358,add_147,call_function,add.Tensor,unknown,,2,2,1,3227,2974,10 -3364,einsum_default_214,call_function,einsum.default,backward,26,2,2,1,3230,2968,5 -3370,mul_225,call_function,mul.Tensor,backward,26,2,2,1,3232,2959,8 -3389,mul_228,call_function,mul.Tensor,backward,26,2,2,1,2965,2958,8 -3387,mul_227,call_function,mul.Tensor,backward,26,2,2,1,3242,2957,8 -3391,mul_229,call_function,mul.Tensor,backward,26,2,2,1,3246,2956,8 -3369,mul_224,call_function,mul.Tensor,backward,26,2,2,1,3232,2955,8 -3374,einsum_default_216,call_function,einsum.default,backward,26,2,2,1,3235,2949,5 -3396,einsum_default_218,call_function,einsum.default,backward,26,2,2,1,3250,2949,5 -3397,add_150,call_function,add.Tensor,unknown,,2,2,1,3255,2948,10 -3406,mul_231,call_function,mul.Tensor,backward,26,2,2,1,2944,2946,8 -3405,mul_230,call_function,mul.Tensor,backward,26,2,2,1,3259,2940,8 -3409,mul_232,call_function,mul.Tensor,backward,26,2,2,1,3264,2938,8 -3412,mul_233,call_function,mul.Tensor,backward,26,2,2,1,3267,2936,8 -3413,sub_5,call_function,sub.Tensor,backward,26,2,2,1,3268,2935,10 
-3414,mul_234,call_function,mul.Tensor,backward,26,2,2,1,3269,2934,8 -3419,add_151,call_function,add.Tensor,unknown,,2,2,1,3271,2932,10 -3425,einsum_default_220,call_function,einsum.default,backward,26,2,2,1,3274,2926,5 -3444,mul_236,call_function,mul.Tensor,backward,26,2,2,1,3288,2892,8 -3449,mul_237,call_function,mul.Tensor,backward,26,2,2,1,3288,2891,8 -3462,einsum_default_222,call_function,einsum.default,backward,26,2,2,1,3286,2882,5 -3469,einsum_default_224,call_function,einsum.default,backward,26,2,2,1,3295,2882,5 -3470,add_152,call_function,add.Tensor,unknown,,2,2,1,3302,2881,10 -3477,einsum_default_226,call_function,einsum.default,backward,26,2,2,1,3295,2881,5 -3478,add_153,call_function,add.Tensor,unknown,,2,2,1,3318,2880,10 -3487,mul_239,call_function,mul.Tensor,backward,26,2,2,1,2877,2878,8 -3486,mul_238,call_function,mul.Tensor,backward,26,2,2,1,3322,2872,8 -3490,mul_240,call_function,mul.Tensor,backward,26,2,2,1,3327,2870,8 -3493,mul_241,call_function,mul.Tensor,backward,26,2,2,1,3330,2868,8 -3494,sub_6,call_function,sub.Tensor,backward,26,2,2,1,3331,2867,10 -3495,mul_242,call_function,mul.Tensor,backward,26,2,2,1,3332,2866,8 -3500,add_154,call_function,add.Tensor,unknown,,2,2,1,3334,2864,10 -3506,einsum_default_228,call_function,einsum.default,backward,25,2,2,1,3337,2858,5 -3512,mul_245,call_function,mul.Tensor,backward,25,2,2,1,3339,2849,8 -3531,mul_248,call_function,mul.Tensor,backward,25,2,2,1,2855,2848,8 -3529,mul_247,call_function,mul.Tensor,backward,25,2,2,1,3349,2847,8 -3533,mul_249,call_function,mul.Tensor,backward,25,2,2,1,3353,2846,8 -3511,mul_244,call_function,mul.Tensor,backward,25,2,2,1,3339,2845,8 -3516,einsum_default_230,call_function,einsum.default,backward,25,2,2,1,3342,2839,5 -3538,einsum_default_232,call_function,einsum.default,backward,25,2,2,1,3357,2839,5 -3539,add_157,call_function,add.Tensor,unknown,,2,2,1,3362,2838,10 -3548,mul_251,call_function,mul.Tensor,backward,25,2,2,1,2834,2836,8 
-3547,mul_250,call_function,mul.Tensor,backward,25,2,2,1,3366,2830,8 -3551,mul_252,call_function,mul.Tensor,backward,25,2,2,1,3371,2828,8 -3554,mul_253,call_function,mul.Tensor,backward,25,2,2,1,3374,2826,8 -3555,sub_8,call_function,sub.Tensor,backward,25,2,2,1,3375,2825,10 -3556,mul_254,call_function,mul.Tensor,backward,25,2,2,1,3376,2824,8 -3561,add_158,call_function,add.Tensor,unknown,,2,2,1,3378,2822,10 -3567,einsum_default_234,call_function,einsum.default,backward,25,2,2,1,3381,2816,5 -3586,mul_256,call_function,mul.Tensor,backward,25,2,2,1,3395,2782,8 -3591,mul_257,call_function,mul.Tensor,backward,25,2,2,1,3395,2781,8 -3604,einsum_default_236,call_function,einsum.default,backward,25,2,2,1,3393,2772,5 -3611,einsum_default_238,call_function,einsum.default,backward,25,2,2,1,3402,2772,5 -3612,add_159,call_function,add.Tensor,unknown,,2,2,1,3409,2771,10 -3619,einsum_default_240,call_function,einsum.default,backward,25,2,2,1,3402,2771,5 -3620,add_160,call_function,add.Tensor,unknown,,2,2,1,3425,2770,10 -3629,mul_259,call_function,mul.Tensor,backward,25,2,2,1,2767,2768,8 -3628,mul_258,call_function,mul.Tensor,backward,25,2,2,1,3429,2762,8 -3632,mul_260,call_function,mul.Tensor,backward,25,2,2,1,3434,2760,8 -3635,mul_261,call_function,mul.Tensor,backward,25,2,2,1,3437,2758,8 -3636,sub_9,call_function,sub.Tensor,backward,25,2,2,1,3438,2757,10 -3637,mul_262,call_function,mul.Tensor,backward,25,2,2,1,3439,2756,8 -3642,add_161,call_function,add.Tensor,unknown,,2,2,1,3441,2754,10 -3648,einsum_default_242,call_function,einsum.default,backward,24,2,2,1,3444,2748,5 -3654,mul_265,call_function,mul.Tensor,backward,24,2,2,1,3446,2739,8 -3673,mul_268,call_function,mul.Tensor,backward,24,2,2,1,2745,2738,8 -3671,mul_267,call_function,mul.Tensor,backward,24,2,2,1,3456,2737,8 -3675,mul_269,call_function,mul.Tensor,backward,24,2,2,1,3460,2736,8 -3653,mul_264,call_function,mul.Tensor,backward,24,2,2,1,3446,2735,8 
-3658,einsum_default_244,call_function,einsum.default,backward,24,2,2,1,3449,2729,5 -3680,einsum_default_246,call_function,einsum.default,backward,24,2,2,1,3464,2729,5 -3681,add_164,call_function,add.Tensor,unknown,,2,2,1,3469,2728,10 -3690,mul_271,call_function,mul.Tensor,backward,24,2,2,1,2724,2726,8 -3689,mul_270,call_function,mul.Tensor,backward,24,2,2,1,3473,2720,8 -3693,mul_272,call_function,mul.Tensor,backward,24,2,2,1,3478,2718,8 -3696,mul_273,call_function,mul.Tensor,backward,24,2,2,1,3481,2716,8 -3697,sub_11,call_function,sub.Tensor,backward,24,2,2,1,3482,2715,10 -3698,mul_274,call_function,mul.Tensor,backward,24,2,2,1,3483,2714,8 -3703,add_165,call_function,add.Tensor,unknown,,2,2,1,3485,2712,10 -3709,einsum_default_248,call_function,einsum.default,backward,24,2,2,1,3488,2706,5 -3728,mul_276,call_function,mul.Tensor,backward,24,2,2,1,3502,2672,8 -3733,mul_277,call_function,mul.Tensor,backward,24,2,2,1,3502,2671,8 -3746,einsum_default_250,call_function,einsum.default,backward,24,2,2,1,3500,2662,5 -3753,einsum_default_252,call_function,einsum.default,backward,24,2,2,1,3509,2662,5 -3754,add_166,call_function,add.Tensor,unknown,,2,2,1,3516,2661,10 -3761,einsum_default_254,call_function,einsum.default,backward,24,2,2,1,3509,2661,5 -3762,add_167,call_function,add.Tensor,unknown,,2,2,1,3532,2660,10 -3771,mul_279,call_function,mul.Tensor,backward,24,2,2,1,2657,2658,8 -3770,mul_278,call_function,mul.Tensor,backward,24,2,2,1,3536,2652,8 -3774,mul_280,call_function,mul.Tensor,backward,24,2,2,1,3541,2650,8 -3777,mul_281,call_function,mul.Tensor,backward,24,2,2,1,3544,2648,8 -3778,sub_12,call_function,sub.Tensor,backward,24,2,2,1,3545,2647,10 -3779,mul_282,call_function,mul.Tensor,backward,24,2,2,1,3546,2646,8 -3784,add_168,call_function,add.Tensor,unknown,,2,2,1,3548,2644,10 -3790,einsum_default_256,call_function,einsum.default,backward,23,2,2,1,3551,2638,5 -3796,mul_285,call_function,mul.Tensor,backward,23,2,2,1,3553,2629,8 
-3815,mul_288,call_function,mul.Tensor,backward,23,2,2,1,2635,2628,8 -3813,mul_287,call_function,mul.Tensor,backward,23,2,2,1,3563,2627,8 -3817,mul_289,call_function,mul.Tensor,backward,23,2,2,1,3567,2626,8 -3795,mul_284,call_function,mul.Tensor,backward,23,2,2,1,3553,2625,8 -3800,einsum_default_258,call_function,einsum.default,backward,23,2,2,1,3556,2619,5 -3822,einsum_default_260,call_function,einsum.default,backward,23,2,2,1,3571,2619,5 -3823,add_171,call_function,add.Tensor,unknown,,2,2,1,3576,2618,10 -3832,mul_291,call_function,mul.Tensor,backward,23,2,2,1,2614,2616,8 -3831,mul_290,call_function,mul.Tensor,backward,23,2,2,1,3580,2610,8 -3835,mul_292,call_function,mul.Tensor,backward,23,2,2,1,3585,2608,8 -3838,mul_293,call_function,mul.Tensor,backward,23,2,2,1,3588,2606,8 -3839,sub_14,call_function,sub.Tensor,backward,23,2,2,1,3589,2605,10 -3840,mul_294,call_function,mul.Tensor,backward,23,2,2,1,3590,2604,8 -3845,add_172,call_function,add.Tensor,unknown,,2,2,1,3592,2602,10 -3851,einsum_default_262,call_function,einsum.default,backward,23,2,2,1,3595,2596,5 -3870,mul_296,call_function,mul.Tensor,backward,23,2,2,1,3609,2562,8 -3875,mul_297,call_function,mul.Tensor,backward,23,2,2,1,3609,2561,8 -3888,einsum_default_264,call_function,einsum.default,backward,23,2,2,1,3607,2552,5 -3895,einsum_default_266,call_function,einsum.default,backward,23,2,2,1,3616,2552,5 -3896,add_173,call_function,add.Tensor,unknown,,2,2,1,3623,2551,10 -3903,einsum_default_268,call_function,einsum.default,backward,23,2,2,1,3616,2551,5 -3904,add_174,call_function,add.Tensor,unknown,,2,2,1,3639,2550,10 -3913,mul_299,call_function,mul.Tensor,backward,23,2,2,1,2547,2548,8 -3912,mul_298,call_function,mul.Tensor,backward,23,2,2,1,3643,2542,8 -3916,mul_300,call_function,mul.Tensor,backward,23,2,2,1,3648,2540,8 -3919,mul_301,call_function,mul.Tensor,backward,23,2,2,1,3651,2538,8 -3920,sub_15,call_function,sub.Tensor,backward,23,2,2,1,3652,2537,10 
-3921,mul_302,call_function,mul.Tensor,backward,23,2,2,1,3653,2536,8 -3926,add_175,call_function,add.Tensor,unknown,,2,2,1,3655,2534,10 -3932,einsum_default_270,call_function,einsum.default,backward,22,2,2,1,3658,2528,5 -3938,mul_305,call_function,mul.Tensor,backward,22,2,2,1,3660,2519,8 -3957,mul_308,call_function,mul.Tensor,backward,22,2,2,1,2525,2518,8 -3955,mul_307,call_function,mul.Tensor,backward,22,2,2,1,3670,2517,8 -3959,mul_309,call_function,mul.Tensor,backward,22,2,2,1,3674,2516,8 -3937,mul_304,call_function,mul.Tensor,backward,22,2,2,1,3660,2515,8 -3942,einsum_default_272,call_function,einsum.default,backward,22,2,2,1,3663,2509,5 -3964,einsum_default_274,call_function,einsum.default,backward,22,2,2,1,3678,2509,5 -3965,add_178,call_function,add.Tensor,unknown,,2,2,1,3683,2508,10 -3974,mul_311,call_function,mul.Tensor,backward,22,2,2,1,2504,2506,8 -3973,mul_310,call_function,mul.Tensor,backward,22,2,2,1,3687,2500,8 -3977,mul_312,call_function,mul.Tensor,backward,22,2,2,1,3692,2498,8 -3980,mul_313,call_function,mul.Tensor,backward,22,2,2,1,3695,2496,8 -3981,sub_17,call_function,sub.Tensor,backward,22,2,2,1,3696,2495,10 -3982,mul_314,call_function,mul.Tensor,backward,22,2,2,1,3697,2494,8 -3987,add_179,call_function,add.Tensor,unknown,,2,2,1,3699,2492,10 -3993,einsum_default_276,call_function,einsum.default,backward,22,2,2,1,3702,2486,5 -4012,mul_316,call_function,mul.Tensor,backward,22,2,2,1,3716,2452,8 -4017,mul_317,call_function,mul.Tensor,backward,22,2,2,1,3716,2451,8 -4030,einsum_default_278,call_function,einsum.default,backward,22,2,2,1,3714,2442,5 -4037,einsum_default_280,call_function,einsum.default,backward,22,2,2,1,3723,2442,5 -4038,add_180,call_function,add.Tensor,unknown,,2,2,1,3730,2441,10 -4045,einsum_default_282,call_function,einsum.default,backward,22,2,2,1,3723,2441,5 -4046,add_181,call_function,add.Tensor,unknown,,2,2,1,3746,2440,10 -4055,mul_319,call_function,mul.Tensor,backward,22,2,2,1,2437,2438,8 
-4054,mul_318,call_function,mul.Tensor,backward,22,2,2,1,3750,2432,8 -4058,mul_320,call_function,mul.Tensor,backward,22,2,2,1,3755,2430,8 -4061,mul_321,call_function,mul.Tensor,backward,22,2,2,1,3758,2428,8 -4062,sub_18,call_function,sub.Tensor,backward,22,2,2,1,3759,2427,10 -4063,mul_322,call_function,mul.Tensor,backward,22,2,2,1,3760,2426,8 -4068,add_182,call_function,add.Tensor,unknown,,2,2,1,3762,2424,10 -4074,einsum_default_284,call_function,einsum.default,backward,21,2,2,1,3765,2418,5 -4080,mul_325,call_function,mul.Tensor,backward,21,2,2,1,3767,2409,8 -4099,mul_328,call_function,mul.Tensor,backward,21,2,2,1,2415,2408,8 -4097,mul_327,call_function,mul.Tensor,backward,21,2,2,1,3777,2407,8 -4101,mul_329,call_function,mul.Tensor,backward,21,2,2,1,3781,2406,8 -4079,mul_324,call_function,mul.Tensor,backward,21,2,2,1,3767,2405,8 -4084,einsum_default_286,call_function,einsum.default,backward,21,2,2,1,3770,2399,5 -4106,einsum_default_288,call_function,einsum.default,backward,21,2,2,1,3785,2399,5 -4107,add_185,call_function,add.Tensor,unknown,,2,2,1,3790,2398,10 -4116,mul_331,call_function,mul.Tensor,backward,21,2,2,1,2394,2396,8 -4115,mul_330,call_function,mul.Tensor,backward,21,2,2,1,3794,2390,8 -4119,mul_332,call_function,mul.Tensor,backward,21,2,2,1,3799,2388,8 -4122,mul_333,call_function,mul.Tensor,backward,21,2,2,1,3802,2386,8 -4123,sub_20,call_function,sub.Tensor,backward,21,2,2,1,3803,2385,10 -4124,mul_334,call_function,mul.Tensor,backward,21,2,2,1,3804,2384,8 -4129,add_186,call_function,add.Tensor,unknown,,2,2,1,3806,2382,10 -4135,einsum_default_290,call_function,einsum.default,backward,21,2,2,1,3809,2376,5 -4154,mul_336,call_function,mul.Tensor,backward,21,2,2,1,3823,2342,8 -4159,mul_337,call_function,mul.Tensor,backward,21,2,2,1,3823,2341,8 -4172,einsum_default_292,call_function,einsum.default,backward,21,2,2,1,3821,2332,5 -4179,einsum_default_294,call_function,einsum.default,backward,21,2,2,1,3830,2332,5 
-4180,add_187,call_function,add.Tensor,unknown,,2,2,1,3837,2331,10 -4187,einsum_default_296,call_function,einsum.default,backward,21,2,2,1,3830,2331,5 -4188,add_188,call_function,add.Tensor,unknown,,2,2,1,3853,2330,10 -4197,mul_339,call_function,mul.Tensor,backward,21,2,2,1,2327,2328,8 -4196,mul_338,call_function,mul.Tensor,backward,21,2,2,1,3857,2322,8 -4200,mul_340,call_function,mul.Tensor,backward,21,2,2,1,3862,2320,8 -4203,mul_341,call_function,mul.Tensor,backward,21,2,2,1,3865,2318,8 -4204,sub_21,call_function,sub.Tensor,backward,21,2,2,1,3866,2317,10 -4205,mul_342,call_function,mul.Tensor,backward,21,2,2,1,3867,2316,8 -4210,add_189,call_function,add.Tensor,unknown,,2,2,1,3869,2314,10 -4216,einsum_default_298,call_function,einsum.default,backward,20,2,2,1,3872,2308,5 -4222,mul_345,call_function,mul.Tensor,backward,20,2,2,1,3874,2299,8 -4241,mul_348,call_function,mul.Tensor,backward,20,2,2,1,2305,2298,8 -4239,mul_347,call_function,mul.Tensor,backward,20,2,2,1,3884,2297,8 -4243,mul_349,call_function,mul.Tensor,backward,20,2,2,1,3888,2296,8 -4221,mul_344,call_function,mul.Tensor,backward,20,2,2,1,3874,2295,8 -4226,einsum_default_300,call_function,einsum.default,backward,20,2,2,1,3877,2289,5 -4248,einsum_default_302,call_function,einsum.default,backward,20,2,2,1,3892,2289,5 -4249,add_192,call_function,add.Tensor,unknown,,2,2,1,3897,2288,10 -4258,mul_351,call_function,mul.Tensor,backward,20,2,2,1,2284,2286,8 -4257,mul_350,call_function,mul.Tensor,backward,20,2,2,1,3901,2280,8 -4261,mul_352,call_function,mul.Tensor,backward,20,2,2,1,3906,2278,8 -4264,mul_353,call_function,mul.Tensor,backward,20,2,2,1,3909,2276,8 -4265,sub_23,call_function,sub.Tensor,backward,20,2,2,1,3910,2275,10 -4266,mul_354,call_function,mul.Tensor,backward,20,2,2,1,3911,2274,8 -4271,add_193,call_function,add.Tensor,unknown,,2,2,1,3913,2272,10 -4277,einsum_default_304,call_function,einsum.default,backward,20,2,2,1,3916,2266,5 -4296,mul_356,call_function,mul.Tensor,backward,20,2,2,1,3930,2232,8 
-4301,mul_357,call_function,mul.Tensor,backward,20,2,2,1,3930,2231,8 -4314,einsum_default_306,call_function,einsum.default,backward,20,2,2,1,3928,2222,5 -4321,einsum_default_308,call_function,einsum.default,backward,20,2,2,1,3937,2222,5 -4322,add_194,call_function,add.Tensor,unknown,,2,2,1,3944,2221,10 -4329,einsum_default_310,call_function,einsum.default,backward,20,2,2,1,3937,2221,5 -4330,add_195,call_function,add.Tensor,unknown,,2,2,1,3960,2220,10 -4339,mul_359,call_function,mul.Tensor,backward,20,2,2,1,2217,2218,8 -4338,mul_358,call_function,mul.Tensor,backward,20,2,2,1,3964,2212,8 -4342,mul_360,call_function,mul.Tensor,backward,20,2,2,1,3969,2210,8 -4345,mul_361,call_function,mul.Tensor,backward,20,2,2,1,3972,2208,8 -4346,sub_24,call_function,sub.Tensor,backward,20,2,2,1,3973,2207,10 -4347,mul_362,call_function,mul.Tensor,backward,20,2,2,1,3974,2206,8 -4352,add_196,call_function,add.Tensor,unknown,,2,2,1,3976,2204,10 -4358,einsum_default_312,call_function,einsum.default,backward,19,2,2,1,3979,2198,5 -4364,mul_365,call_function,mul.Tensor,backward,19,2,2,1,3981,2189,8 -4383,mul_368,call_function,mul.Tensor,backward,19,2,2,1,2195,2188,8 -4381,mul_367,call_function,mul.Tensor,backward,19,2,2,1,3991,2187,8 -4385,mul_369,call_function,mul.Tensor,backward,19,2,2,1,3995,2186,8 -4363,mul_364,call_function,mul.Tensor,backward,19,2,2,1,3981,2185,8 -4368,einsum_default_314,call_function,einsum.default,backward,19,2,2,1,3984,2179,5 -4390,einsum_default_316,call_function,einsum.default,backward,19,2,2,1,3999,2179,5 -4391,add_199,call_function,add.Tensor,unknown,,2,2,1,4004,2178,10 -4400,mul_371,call_function,mul.Tensor,backward,19,2,2,1,2174,2176,8 -4399,mul_370,call_function,mul.Tensor,backward,19,2,2,1,4008,2170,8 -4403,mul_372,call_function,mul.Tensor,backward,19,2,2,1,4013,2168,8 -4406,mul_373,call_function,mul.Tensor,backward,19,2,2,1,4016,2166,8 -4407,sub_26,call_function,sub.Tensor,backward,19,2,2,1,4017,2165,10 
-4408,mul_374,call_function,mul.Tensor,backward,19,2,2,1,4018,2164,8 -4413,add_200,call_function,add.Tensor,unknown,,2,2,1,4020,2162,10 -4419,einsum_default_318,call_function,einsum.default,backward,19,2,2,1,4023,2156,5 -4438,mul_376,call_function,mul.Tensor,backward,19,2,2,1,4037,2122,8 -4443,mul_377,call_function,mul.Tensor,backward,19,2,2,1,4037,2121,8 -4456,einsum_default_320,call_function,einsum.default,backward,19,2,2,1,4035,2112,5 -4463,einsum_default_322,call_function,einsum.default,backward,19,2,2,1,4044,2112,5 -4464,add_201,call_function,add.Tensor,unknown,,2,2,1,4051,2111,10 -4471,einsum_default_324,call_function,einsum.default,backward,19,2,2,1,4044,2111,5 -4472,add_202,call_function,add.Tensor,unknown,,2,2,1,4067,2110,10 -4481,mul_379,call_function,mul.Tensor,backward,19,2,2,1,2107,2108,8 -4480,mul_378,call_function,mul.Tensor,backward,19,2,2,1,4071,2102,8 -4484,mul_380,call_function,mul.Tensor,backward,19,2,2,1,4076,2100,8 -4487,mul_381,call_function,mul.Tensor,backward,19,2,2,1,4079,2098,8 -4488,sub_27,call_function,sub.Tensor,backward,19,2,2,1,4080,2097,10 -4489,mul_382,call_function,mul.Tensor,backward,19,2,2,1,4081,2096,8 -4494,add_203,call_function,add.Tensor,unknown,,2,2,1,4083,2094,10 -4500,einsum_default_326,call_function,einsum.default,backward,18,2,2,1,4086,2088,5 -4506,mul_385,call_function,mul.Tensor,backward,18,2,2,1,4088,2079,8 -4525,mul_388,call_function,mul.Tensor,backward,18,2,2,1,2085,2078,8 -4523,mul_387,call_function,mul.Tensor,backward,18,2,2,1,4098,2077,8 -4527,mul_389,call_function,mul.Tensor,backward,18,2,2,1,4102,2076,8 -4505,mul_384,call_function,mul.Tensor,backward,18,2,2,1,4088,2075,8 -4510,einsum_default_328,call_function,einsum.default,backward,18,2,2,1,4091,2069,5 -4532,einsum_default_330,call_function,einsum.default,backward,18,2,2,1,4106,2069,5 -4533,add_206,call_function,add.Tensor,unknown,,2,2,1,4111,2068,10 -4542,mul_391,call_function,mul.Tensor,backward,18,2,2,1,2064,2066,8 
-4541,mul_390,call_function,mul.Tensor,backward,18,2,2,1,4115,2060,8 -4545,mul_392,call_function,mul.Tensor,backward,18,2,2,1,4120,2058,8 -4548,mul_393,call_function,mul.Tensor,backward,18,2,2,1,4123,2056,8 -4549,sub_29,call_function,sub.Tensor,backward,18,2,2,1,4124,2055,10 -4550,mul_394,call_function,mul.Tensor,backward,18,2,2,1,4125,2054,8 -4555,add_207,call_function,add.Tensor,unknown,,2,2,1,4127,2052,10 -4561,einsum_default_332,call_function,einsum.default,backward,18,2,2,1,4130,2046,5 -4580,mul_396,call_function,mul.Tensor,backward,18,2,2,1,4144,2012,8 -4585,mul_397,call_function,mul.Tensor,backward,18,2,2,1,4144,2011,8 -4598,einsum_default_334,call_function,einsum.default,backward,18,2,2,1,4142,2002,5 -4605,einsum_default_336,call_function,einsum.default,backward,18,2,2,1,4151,2002,5 -4606,add_208,call_function,add.Tensor,unknown,,2,2,1,4158,2001,10 -4613,einsum_default_338,call_function,einsum.default,backward,18,2,2,1,4151,2001,5 -4614,add_209,call_function,add.Tensor,unknown,,2,2,1,4174,2000,10 -4623,mul_399,call_function,mul.Tensor,backward,18,2,2,1,1997,1998,8 -4622,mul_398,call_function,mul.Tensor,backward,18,2,2,1,4178,1992,8 -4626,mul_400,call_function,mul.Tensor,backward,18,2,2,1,4183,1990,8 -4629,mul_401,call_function,mul.Tensor,backward,18,2,2,1,4186,1988,8 -4630,sub_30,call_function,sub.Tensor,backward,18,2,2,1,4187,1987,10 -4631,mul_402,call_function,mul.Tensor,backward,18,2,2,1,4188,1986,8 -4636,add_210,call_function,add.Tensor,unknown,,2,2,1,4190,1984,10 -4642,einsum_default_340,call_function,einsum.default,backward,17,2,2,1,4193,1978,5 -4648,mul_405,call_function,mul.Tensor,backward,17,2,2,1,4195,1969,8 -4667,mul_408,call_function,mul.Tensor,backward,17,2,2,1,1975,1968,8 -4665,mul_407,call_function,mul.Tensor,backward,17,2,2,1,4205,1967,8 -4669,mul_409,call_function,mul.Tensor,backward,17,2,2,1,4209,1966,8 -4647,mul_404,call_function,mul.Tensor,backward,17,2,2,1,4195,1965,8 
-4652,einsum_default_342,call_function,einsum.default,backward,17,2,2,1,4198,1959,5 -4674,einsum_default_344,call_function,einsum.default,backward,17,2,2,1,4213,1959,5 -4675,add_213,call_function,add.Tensor,unknown,,2,2,1,4218,1958,10 -4684,mul_411,call_function,mul.Tensor,backward,17,2,2,1,1954,1956,8 -4683,mul_410,call_function,mul.Tensor,backward,17,2,2,1,4222,1950,8 -4687,mul_412,call_function,mul.Tensor,backward,17,2,2,1,4227,1948,8 -4690,mul_413,call_function,mul.Tensor,backward,17,2,2,1,4230,1946,8 -4691,sub_32,call_function,sub.Tensor,backward,17,2,2,1,4231,1945,10 -4692,mul_414,call_function,mul.Tensor,backward,17,2,2,1,4232,1944,8 -4697,add_214,call_function,add.Tensor,unknown,,2,2,1,4234,1942,10 -4703,einsum_default_346,call_function,einsum.default,backward,17,2,2,1,4237,1936,5 -4722,mul_416,call_function,mul.Tensor,backward,17,2,2,1,4251,1902,8 -4727,mul_417,call_function,mul.Tensor,backward,17,2,2,1,4251,1901,8 -4740,einsum_default_348,call_function,einsum.default,backward,17,2,2,1,4249,1892,5 -4747,einsum_default_350,call_function,einsum.default,backward,17,2,2,1,4258,1892,5 -4748,add_215,call_function,add.Tensor,unknown,,2,2,1,4265,1891,10 -4755,einsum_default_352,call_function,einsum.default,backward,17,2,2,1,4258,1891,5 -4756,add_216,call_function,add.Tensor,unknown,,2,2,1,4281,1890,10 -4765,mul_419,call_function,mul.Tensor,backward,17,2,2,1,1887,1888,8 -4764,mul_418,call_function,mul.Tensor,backward,17,2,2,1,4285,1882,8 -4768,mul_420,call_function,mul.Tensor,backward,17,2,2,1,4290,1880,8 -4771,mul_421,call_function,mul.Tensor,backward,17,2,2,1,4293,1878,8 -4772,sub_33,call_function,sub.Tensor,backward,17,2,2,1,4294,1877,10 -4773,mul_422,call_function,mul.Tensor,backward,17,2,2,1,4295,1876,8 -4778,add_217,call_function,add.Tensor,unknown,,2,2,1,4297,1874,10 -4784,einsum_default_354,call_function,einsum.default,backward,16,2,2,1,4300,1868,5 -4790,mul_425,call_function,mul.Tensor,backward,16,2,2,1,4302,1859,8 
-4809,mul_428,call_function,mul.Tensor,backward,16,2,2,1,1865,1858,8 -4807,mul_427,call_function,mul.Tensor,backward,16,2,2,1,4312,1857,8 -4811,mul_429,call_function,mul.Tensor,backward,16,2,2,1,4316,1856,8 -4789,mul_424,call_function,mul.Tensor,backward,16,2,2,1,4302,1855,8 -4794,einsum_default_356,call_function,einsum.default,backward,16,2,2,1,4305,1849,5 -4816,einsum_default_358,call_function,einsum.default,backward,16,2,2,1,4320,1849,5 -4817,add_220,call_function,add.Tensor,unknown,,2,2,1,4325,1848,10 -4826,mul_431,call_function,mul.Tensor,backward,16,2,2,1,1844,1846,8 -4825,mul_430,call_function,mul.Tensor,backward,16,2,2,1,4329,1840,8 -4829,mul_432,call_function,mul.Tensor,backward,16,2,2,1,4334,1838,8 -4832,mul_433,call_function,mul.Tensor,backward,16,2,2,1,4337,1836,8 -4833,sub_35,call_function,sub.Tensor,backward,16,2,2,1,4338,1835,10 -4834,mul_434,call_function,mul.Tensor,backward,16,2,2,1,4339,1834,8 -4839,add_221,call_function,add.Tensor,unknown,,2,2,1,4341,1832,10 -4845,einsum_default_360,call_function,einsum.default,backward,16,2,2,1,4344,1826,5 -4864,mul_436,call_function,mul.Tensor,backward,16,2,2,1,4358,1792,8 -4869,mul_437,call_function,mul.Tensor,backward,16,2,2,1,4358,1791,8 -4882,einsum_default_362,call_function,einsum.default,backward,16,2,2,1,4356,1782,5 -4889,einsum_default_364,call_function,einsum.default,backward,16,2,2,1,4365,1782,5 -4890,add_222,call_function,add.Tensor,unknown,,2,2,1,4372,1781,10 -4897,einsum_default_366,call_function,einsum.default,backward,16,2,2,1,4365,1781,5 -4898,add_223,call_function,add.Tensor,unknown,,2,2,1,4388,1780,10 -4907,mul_439,call_function,mul.Tensor,backward,16,2,2,1,1777,1778,8 -4906,mul_438,call_function,mul.Tensor,backward,16,2,2,1,4392,1772,8 -4910,mul_440,call_function,mul.Tensor,backward,16,2,2,1,4397,1770,8 -4913,mul_441,call_function,mul.Tensor,backward,16,2,2,1,4400,1768,8 -4914,sub_36,call_function,sub.Tensor,backward,16,2,2,1,4401,1767,10 
-4915,mul_442,call_function,mul.Tensor,backward,16,2,2,1,4402,1766,8 -4920,add_224,call_function,add.Tensor,unknown,,2,2,1,4404,1764,10 -4926,einsum_default_368,call_function,einsum.default,backward,15,2,2,1,4407,1758,5 -4932,mul_445,call_function,mul.Tensor,backward,15,2,2,1,4409,1749,8 -4951,mul_448,call_function,mul.Tensor,backward,15,2,2,1,1755,1748,8 -4949,mul_447,call_function,mul.Tensor,backward,15,2,2,1,4419,1747,8 -4953,mul_449,call_function,mul.Tensor,backward,15,2,2,1,4423,1746,8 -4931,mul_444,call_function,mul.Tensor,backward,15,2,2,1,4409,1745,8 -4936,einsum_default_370,call_function,einsum.default,backward,15,2,2,1,4412,1739,5 -4958,einsum_default_372,call_function,einsum.default,backward,15,2,2,1,4427,1739,5 -4959,add_227,call_function,add.Tensor,unknown,,2,2,1,4432,1738,10 -4968,mul_451,call_function,mul.Tensor,backward,15,2,2,1,1734,1736,8 -4967,mul_450,call_function,mul.Tensor,backward,15,2,2,1,4436,1730,8 -4971,mul_452,call_function,mul.Tensor,backward,15,2,2,1,4441,1728,8 -4974,mul_453,call_function,mul.Tensor,backward,15,2,2,1,4444,1726,8 -4975,sub_38,call_function,sub.Tensor,backward,15,2,2,1,4445,1725,10 -4976,mul_454,call_function,mul.Tensor,backward,15,2,2,1,4446,1724,8 -4981,add_228,call_function,add.Tensor,unknown,,2,2,1,4448,1722,10 -4987,einsum_default_374,call_function,einsum.default,backward,15,2,2,1,4451,1716,5 -5006,mul_456,call_function,mul.Tensor,backward,15,2,2,1,4465,1682,8 -5011,mul_457,call_function,mul.Tensor,backward,15,2,2,1,4465,1681,8 -5024,einsum_default_376,call_function,einsum.default,backward,15,2,2,1,4463,1672,5 -5031,einsum_default_378,call_function,einsum.default,backward,15,2,2,1,4472,1672,5 -5032,add_229,call_function,add.Tensor,unknown,,2,2,1,4479,1671,10 -5039,einsum_default_380,call_function,einsum.default,backward,15,2,2,1,4472,1671,5 -5040,add_230,call_function,add.Tensor,unknown,,2,2,1,4495,1670,10 -5049,mul_459,call_function,mul.Tensor,backward,15,2,2,1,1667,1668,8 
-5048,mul_458,call_function,mul.Tensor,backward,15,2,2,1,4499,1662,8 -5052,mul_460,call_function,mul.Tensor,backward,15,2,2,1,4504,1660,8 -5055,mul_461,call_function,mul.Tensor,backward,15,2,2,1,4507,1658,8 -5056,sub_39,call_function,sub.Tensor,backward,15,2,2,1,4508,1657,10 -5057,mul_462,call_function,mul.Tensor,backward,15,2,2,1,4509,1656,8 -5062,add_231,call_function,add.Tensor,unknown,,2,2,1,4511,1654,10 -5068,einsum_default_382,call_function,einsum.default,backward,14,2,2,1,4514,1648,5 -5074,mul_465,call_function,mul.Tensor,backward,14,2,2,1,4516,1639,8 -5093,mul_468,call_function,mul.Tensor,backward,14,2,2,1,1645,1638,8 -5091,mul_467,call_function,mul.Tensor,backward,14,2,2,1,4526,1637,8 -5095,mul_469,call_function,mul.Tensor,backward,14,2,2,1,4530,1636,8 -5073,mul_464,call_function,mul.Tensor,backward,14,2,2,1,4516,1635,8 -5078,einsum_default_384,call_function,einsum.default,backward,14,2,2,1,4519,1629,5 -5100,einsum_default_386,call_function,einsum.default,backward,14,2,2,1,4534,1629,5 -5101,add_234,call_function,add.Tensor,unknown,,2,2,1,4539,1628,10 -5110,mul_471,call_function,mul.Tensor,backward,14,2,2,1,1624,1626,8 -5109,mul_470,call_function,mul.Tensor,backward,14,2,2,1,4543,1620,8 -5113,mul_472,call_function,mul.Tensor,backward,14,2,2,1,4548,1618,8 -5116,mul_473,call_function,mul.Tensor,backward,14,2,2,1,4551,1616,8 -5117,sub_41,call_function,sub.Tensor,backward,14,2,2,1,4552,1615,10 -5118,mul_474,call_function,mul.Tensor,backward,14,2,2,1,4553,1614,8 -5123,add_235,call_function,add.Tensor,unknown,,2,2,1,4555,1612,10 -5129,einsum_default_388,call_function,einsum.default,backward,14,2,2,1,4558,1606,5 -5148,mul_476,call_function,mul.Tensor,backward,14,2,2,1,4572,1572,8 -5153,mul_477,call_function,mul.Tensor,backward,14,2,2,1,4572,1571,8 -5166,einsum_default_390,call_function,einsum.default,backward,14,2,2,1,4570,1562,5 -5173,einsum_default_392,call_function,einsum.default,backward,14,2,2,1,4579,1562,5 
-5174,add_236,call_function,add.Tensor,unknown,,2,2,1,4586,1561,10 -5181,einsum_default_394,call_function,einsum.default,backward,14,2,2,1,4579,1561,5 -5182,add_237,call_function,add.Tensor,unknown,,2,2,1,4602,1560,10 -5191,mul_479,call_function,mul.Tensor,backward,14,2,2,1,1557,1558,8 -5190,mul_478,call_function,mul.Tensor,backward,14,2,2,1,4606,1552,8 -5194,mul_480,call_function,mul.Tensor,backward,14,2,2,1,4611,1550,8 -5197,mul_481,call_function,mul.Tensor,backward,14,2,2,1,4614,1548,8 -5198,sub_42,call_function,sub.Tensor,backward,14,2,2,1,4615,1547,10 -5199,mul_482,call_function,mul.Tensor,backward,14,2,2,1,4616,1546,8 -5204,add_238,call_function,add.Tensor,unknown,,2,2,1,4618,1544,10 -5210,einsum_default_396,call_function,einsum.default,backward,13,2,2,1,4621,1538,5 -5216,mul_485,call_function,mul.Tensor,backward,13,2,2,1,4623,1529,8 -5235,mul_488,call_function,mul.Tensor,backward,13,2,2,1,1535,1528,8 -5233,mul_487,call_function,mul.Tensor,backward,13,2,2,1,4633,1527,8 -5237,mul_489,call_function,mul.Tensor,backward,13,2,2,1,4637,1526,8 -5215,mul_484,call_function,mul.Tensor,backward,13,2,2,1,4623,1525,8 -5220,einsum_default_398,call_function,einsum.default,backward,13,2,2,1,4626,1519,5 -5242,einsum_default_400,call_function,einsum.default,backward,13,2,2,1,4641,1519,5 -5243,add_241,call_function,add.Tensor,unknown,,2,2,1,4646,1518,10 -5252,mul_491,call_function,mul.Tensor,backward,13,2,2,1,1514,1516,8 -5251,mul_490,call_function,mul.Tensor,backward,13,2,2,1,4650,1510,8 -5255,mul_492,call_function,mul.Tensor,backward,13,2,2,1,4655,1508,8 -5258,mul_493,call_function,mul.Tensor,backward,13,2,2,1,4658,1506,8 -5259,sub_44,call_function,sub.Tensor,backward,13,2,2,1,4659,1505,10 -5260,mul_494,call_function,mul.Tensor,backward,13,2,2,1,4660,1504,8 -5265,add_242,call_function,add.Tensor,unknown,,2,2,1,4662,1502,10 -5271,einsum_default_402,call_function,einsum.default,backward,13,2,2,1,4665,1496,5 -5290,mul_496,call_function,mul.Tensor,backward,13,2,2,1,4679,1462,8 
-5295,mul_497,call_function,mul.Tensor,backward,13,2,2,1,4679,1461,8 -5308,einsum_default_404,call_function,einsum.default,backward,13,2,2,1,4677,1452,5 -5315,einsum_default_406,call_function,einsum.default,backward,13,2,2,1,4686,1452,5 -5316,add_243,call_function,add.Tensor,unknown,,2,2,1,4693,1451,10 -5323,einsum_default_408,call_function,einsum.default,backward,13,2,2,1,4686,1451,5 -5324,add_244,call_function,add.Tensor,unknown,,2,2,1,4709,1450,10 -5333,mul_499,call_function,mul.Tensor,backward,13,2,2,1,1447,1448,8 -5332,mul_498,call_function,mul.Tensor,backward,13,2,2,1,4713,1442,8 -5336,mul_500,call_function,mul.Tensor,backward,13,2,2,1,4718,1440,8 -5339,mul_501,call_function,mul.Tensor,backward,13,2,2,1,4721,1438,8 -5340,sub_45,call_function,sub.Tensor,backward,13,2,2,1,4722,1437,10 -5341,mul_502,call_function,mul.Tensor,backward,13,2,2,1,4723,1436,8 -5346,add_245,call_function,add.Tensor,unknown,,2,2,1,4725,1434,10 -5352,einsum_default_410,call_function,einsum.default,backward,12,2,2,1,4728,1428,5 -5358,mul_505,call_function,mul.Tensor,backward,12,2,2,1,4730,1419,8 -5377,mul_508,call_function,mul.Tensor,backward,12,2,2,1,1425,1418,8 -5375,mul_507,call_function,mul.Tensor,backward,12,2,2,1,4740,1417,8 -5379,mul_509,call_function,mul.Tensor,backward,12,2,2,1,4744,1416,8 -5357,mul_504,call_function,mul.Tensor,backward,12,2,2,1,4730,1415,8 -5362,einsum_default_412,call_function,einsum.default,backward,12,2,2,1,4733,1409,5 -5384,einsum_default_414,call_function,einsum.default,backward,12,2,2,1,4748,1409,5 -5385,add_248,call_function,add.Tensor,unknown,,2,2,1,4753,1408,10 -5394,mul_511,call_function,mul.Tensor,backward,12,2,2,1,1404,1406,8 -5393,mul_510,call_function,mul.Tensor,backward,12,2,2,1,4757,1400,8 -5397,mul_512,call_function,mul.Tensor,backward,12,2,2,1,4762,1398,8 -5400,mul_513,call_function,mul.Tensor,backward,12,2,2,1,4765,1396,8 -5401,sub_47,call_function,sub.Tensor,backward,12,2,2,1,4766,1395,10 
-5402,mul_514,call_function,mul.Tensor,backward,12,2,2,1,4767,1394,8 -5407,add_249,call_function,add.Tensor,unknown,,2,2,1,4769,1392,10 -5413,einsum_default_416,call_function,einsum.default,backward,12,2,2,1,4772,1386,5 -5432,mul_516,call_function,mul.Tensor,backward,12,2,2,1,4786,1352,8 -5437,mul_517,call_function,mul.Tensor,backward,12,2,2,1,4786,1351,8 -5450,einsum_default_418,call_function,einsum.default,backward,12,2,2,1,4784,1342,5 -5457,einsum_default_420,call_function,einsum.default,backward,12,2,2,1,4793,1342,5 -5458,add_250,call_function,add.Tensor,unknown,,2,2,1,4800,1341,10 -5465,einsum_default_422,call_function,einsum.default,backward,12,2,2,1,4793,1341,5 -5466,add_251,call_function,add.Tensor,unknown,,2,2,1,4816,1340,10 -5475,mul_519,call_function,mul.Tensor,backward,12,2,2,1,1337,1338,8 -5474,mul_518,call_function,mul.Tensor,backward,12,2,2,1,4820,1332,8 -5478,mul_520,call_function,mul.Tensor,backward,12,2,2,1,4825,1330,8 -5481,mul_521,call_function,mul.Tensor,backward,12,2,2,1,4828,1328,8 -5482,sub_48,call_function,sub.Tensor,backward,12,2,2,1,4829,1327,10 -5483,mul_522,call_function,mul.Tensor,backward,12,2,2,1,4830,1326,8 -5488,add_252,call_function,add.Tensor,unknown,,2,2,1,4832,1324,10 -5494,einsum_default_424,call_function,einsum.default,backward,11,2,2,1,4835,1318,5 -5500,mul_525,call_function,mul.Tensor,backward,11,2,2,1,4837,1309,8 -5519,mul_528,call_function,mul.Tensor,backward,11,2,2,1,1315,1308,8 -5517,mul_527,call_function,mul.Tensor,backward,11,2,2,1,4847,1307,8 -5521,mul_529,call_function,mul.Tensor,backward,11,2,2,1,4851,1306,8 -5499,mul_524,call_function,mul.Tensor,backward,11,2,2,1,4837,1305,8 -5504,einsum_default_426,call_function,einsum.default,backward,11,2,2,1,4840,1299,5 -5526,einsum_default_428,call_function,einsum.default,backward,11,2,2,1,4855,1299,5 -5527,add_255,call_function,add.Tensor,unknown,,2,2,1,4860,1298,10 -5536,mul_531,call_function,mul.Tensor,backward,11,2,2,1,1294,1296,8 
-5535,mul_530,call_function,mul.Tensor,backward,11,2,2,1,4864,1290,8 -5539,mul_532,call_function,mul.Tensor,backward,11,2,2,1,4869,1288,8 -5542,mul_533,call_function,mul.Tensor,backward,11,2,2,1,4872,1286,8 -5543,sub_50,call_function,sub.Tensor,backward,11,2,2,1,4873,1285,10 -5544,mul_534,call_function,mul.Tensor,backward,11,2,2,1,4874,1284,8 -5549,add_256,call_function,add.Tensor,unknown,,2,2,1,4876,1282,10 -5555,einsum_default_430,call_function,einsum.default,backward,11,2,2,1,4879,1276,5 -5574,mul_536,call_function,mul.Tensor,backward,11,2,2,1,4893,1242,8 -5579,mul_537,call_function,mul.Tensor,backward,11,2,2,1,4893,1241,8 -5592,einsum_default_432,call_function,einsum.default,backward,11,2,2,1,4891,1232,5 -5599,einsum_default_434,call_function,einsum.default,backward,11,2,2,1,4900,1232,5 -5600,add_257,call_function,add.Tensor,unknown,,2,2,1,4907,1231,10 -5607,einsum_default_436,call_function,einsum.default,backward,11,2,2,1,4900,1231,5 -5608,add_258,call_function,add.Tensor,unknown,,2,2,1,4923,1230,10 -5617,mul_539,call_function,mul.Tensor,backward,11,2,2,1,1227,1228,8 -5616,mul_538,call_function,mul.Tensor,backward,11,2,2,1,4927,1222,8 -5620,mul_540,call_function,mul.Tensor,backward,11,2,2,1,4932,1220,8 -5623,mul_541,call_function,mul.Tensor,backward,11,2,2,1,4935,1218,8 -5624,sub_51,call_function,sub.Tensor,backward,11,2,2,1,4936,1217,10 -5625,mul_542,call_function,mul.Tensor,backward,11,2,2,1,4937,1216,8 -5630,add_259,call_function,add.Tensor,unknown,,2,2,1,4939,1214,10 -5636,einsum_default_438,call_function,einsum.default,backward,10,2,2,1,4942,1208,5 -5642,mul_545,call_function,mul.Tensor,backward,10,2,2,1,4944,1199,8 -5661,mul_548,call_function,mul.Tensor,backward,10,2,2,1,1205,1198,8 -5659,mul_547,call_function,mul.Tensor,backward,10,2,2,1,4954,1197,8 -5663,mul_549,call_function,mul.Tensor,backward,10,2,2,1,4958,1196,8 -5641,mul_544,call_function,mul.Tensor,backward,10,2,2,1,4944,1195,8 
-5646,einsum_default_440,call_function,einsum.default,backward,10,2,2,1,4947,1189,5 -5668,einsum_default_442,call_function,einsum.default,backward,10,2,2,1,4962,1189,5 -5669,add_262,call_function,add.Tensor,unknown,,2,2,1,4967,1188,10 -5678,mul_551,call_function,mul.Tensor,backward,10,2,2,1,1184,1186,8 -5677,mul_550,call_function,mul.Tensor,backward,10,2,2,1,4971,1180,8 -5681,mul_552,call_function,mul.Tensor,backward,10,2,2,1,4976,1178,8 -5684,mul_553,call_function,mul.Tensor,backward,10,2,2,1,4979,1176,8 -5685,sub_53,call_function,sub.Tensor,backward,10,2,2,1,4980,1175,10 -5686,mul_554,call_function,mul.Tensor,backward,10,2,2,1,4981,1174,8 -5691,add_263,call_function,add.Tensor,unknown,,2,2,1,4983,1172,10 -5697,einsum_default_444,call_function,einsum.default,backward,10,2,2,1,4986,1166,5 -5716,mul_556,call_function,mul.Tensor,backward,10,2,2,1,5000,1132,8 -5721,mul_557,call_function,mul.Tensor,backward,10,2,2,1,5000,1131,8 -5734,einsum_default_446,call_function,einsum.default,backward,10,2,2,1,4998,1122,5 -5741,einsum_default_448,call_function,einsum.default,backward,10,2,2,1,5007,1122,5 -5742,add_264,call_function,add.Tensor,unknown,,2,2,1,5014,1121,10 -5749,einsum_default_450,call_function,einsum.default,backward,10,2,2,1,5007,1121,5 -5750,add_265,call_function,add.Tensor,unknown,,2,2,1,5030,1120,10 -5759,mul_559,call_function,mul.Tensor,backward,10,2,2,1,1117,1118,8 -5758,mul_558,call_function,mul.Tensor,backward,10,2,2,1,5034,1112,8 -5762,mul_560,call_function,mul.Tensor,backward,10,2,2,1,5039,1110,8 -5765,mul_561,call_function,mul.Tensor,backward,10,2,2,1,5042,1108,8 -5766,sub_54,call_function,sub.Tensor,backward,10,2,2,1,5043,1107,10 -5767,mul_562,call_function,mul.Tensor,backward,10,2,2,1,5044,1106,8 -5772,add_266,call_function,add.Tensor,unknown,,2,2,1,5046,1104,10 -5778,einsum_default_452,call_function,einsum.default,backward,9,2,2,1,5049,1098,5 -5784,mul_565,call_function,mul.Tensor,backward,9,2,2,1,5051,1089,8 
-5803,mul_568,call_function,mul.Tensor,backward,9,2,2,1,1095,1088,8 -5801,mul_567,call_function,mul.Tensor,backward,9,2,2,1,5061,1087,8 -5805,mul_569,call_function,mul.Tensor,backward,9,2,2,1,5065,1086,8 -5783,mul_564,call_function,mul.Tensor,backward,9,2,2,1,5051,1085,8 -5788,einsum_default_454,call_function,einsum.default,backward,9,2,2,1,5054,1079,5 -5810,einsum_default_456,call_function,einsum.default,backward,9,2,2,1,5069,1079,5 -5811,add_269,call_function,add.Tensor,unknown,,2,2,1,5074,1078,10 -5820,mul_571,call_function,mul.Tensor,backward,9,2,2,1,1074,1076,8 -5819,mul_570,call_function,mul.Tensor,backward,9,2,2,1,5078,1070,8 -5823,mul_572,call_function,mul.Tensor,backward,9,2,2,1,5083,1068,8 -5826,mul_573,call_function,mul.Tensor,backward,9,2,2,1,5086,1066,8 -5827,sub_56,call_function,sub.Tensor,backward,9,2,2,1,5087,1065,10 -5828,mul_574,call_function,mul.Tensor,backward,9,2,2,1,5088,1064,8 -5833,add_270,call_function,add.Tensor,unknown,,2,2,1,5090,1062,10 -5839,einsum_default_458,call_function,einsum.default,backward,9,2,2,1,5093,1056,5 -5858,mul_576,call_function,mul.Tensor,backward,9,2,2,1,5107,1022,8 -5863,mul_577,call_function,mul.Tensor,backward,9,2,2,1,5107,1021,8 -5876,einsum_default_460,call_function,einsum.default,backward,9,2,2,1,5105,1012,5 -5883,einsum_default_462,call_function,einsum.default,backward,9,2,2,1,5114,1012,5 -5884,add_271,call_function,add.Tensor,unknown,,2,2,1,5121,1011,10 -5891,einsum_default_464,call_function,einsum.default,backward,9,2,2,1,5114,1011,5 -5892,add_272,call_function,add.Tensor,unknown,,2,2,1,5137,1010,10 -5901,mul_579,call_function,mul.Tensor,backward,9,2,2,1,1007,1008,8 -5900,mul_578,call_function,mul.Tensor,backward,9,2,2,1,5141,1002,8 -5904,mul_580,call_function,mul.Tensor,backward,9,2,2,1,5146,1000,8 -5907,mul_581,call_function,mul.Tensor,backward,9,2,2,1,5149,998,8 -5908,sub_57,call_function,sub.Tensor,backward,9,2,2,1,5150,997,10 -5909,mul_582,call_function,mul.Tensor,backward,9,2,2,1,5151,996,8 
-5914,add_273,call_function,add.Tensor,unknown,,2,2,1,5153,994,10 -5920,einsum_default_466,call_function,einsum.default,backward,8,2,2,1,5156,988,5 -5926,mul_585,call_function,mul.Tensor,backward,8,2,2,1,5158,979,8 -5945,mul_588,call_function,mul.Tensor,backward,8,2,2,1,985,978,8 -5943,mul_587,call_function,mul.Tensor,backward,8,2,2,1,5168,977,8 -5947,mul_589,call_function,mul.Tensor,backward,8,2,2,1,5172,976,8 -5925,mul_584,call_function,mul.Tensor,backward,8,2,2,1,5158,975,8 -5930,einsum_default_468,call_function,einsum.default,backward,8,2,2,1,5161,969,5 -5952,einsum_default_470,call_function,einsum.default,backward,8,2,2,1,5176,969,5 -5953,add_276,call_function,add.Tensor,unknown,,2,2,1,5181,968,10 -5962,mul_591,call_function,mul.Tensor,backward,8,2,2,1,964,966,8 -5961,mul_590,call_function,mul.Tensor,backward,8,2,2,1,5185,960,8 -5965,mul_592,call_function,mul.Tensor,backward,8,2,2,1,5190,958,8 -5968,mul_593,call_function,mul.Tensor,backward,8,2,2,1,5193,956,8 -5969,sub_59,call_function,sub.Tensor,backward,8,2,2,1,5194,955,10 -5970,mul_594,call_function,mul.Tensor,backward,8,2,2,1,5195,954,8 -5975,add_277,call_function,add.Tensor,unknown,,2,2,1,5197,952,10 -5981,einsum_default_472,call_function,einsum.default,backward,8,2,2,1,5200,946,5 -6000,mul_596,call_function,mul.Tensor,backward,8,2,2,1,5214,912,8 -6005,mul_597,call_function,mul.Tensor,backward,8,2,2,1,5214,911,8 -6018,einsum_default_474,call_function,einsum.default,backward,8,2,2,1,5212,902,5 -6025,einsum_default_476,call_function,einsum.default,backward,8,2,2,1,5221,902,5 -6026,add_278,call_function,add.Tensor,unknown,,2,2,1,5228,901,10 -6033,einsum_default_478,call_function,einsum.default,backward,8,2,2,1,5221,901,5 -6034,add_279,call_function,add.Tensor,unknown,,2,2,1,5244,900,10 -6043,mul_599,call_function,mul.Tensor,backward,8,2,2,1,897,898,8 -6042,mul_598,call_function,mul.Tensor,backward,8,2,2,1,5248,892,8 -6046,mul_600,call_function,mul.Tensor,backward,8,2,2,1,5253,890,8 
-6049,mul_601,call_function,mul.Tensor,backward,8,2,2,1,5256,888,8 -6050,sub_60,call_function,sub.Tensor,backward,8,2,2,1,5257,887,10 -6051,mul_602,call_function,mul.Tensor,backward,8,2,2,1,5258,886,8 -6056,add_280,call_function,add.Tensor,unknown,,2,2,1,5260,884,10 -6062,einsum_default_480,call_function,einsum.default,backward,7,2,2,1,5263,878,5 -6068,mul_605,call_function,mul.Tensor,backward,7,2,2,1,5265,869,8 -6087,mul_608,call_function,mul.Tensor,backward,7,2,2,1,875,868,8 -6085,mul_607,call_function,mul.Tensor,backward,7,2,2,1,5275,867,8 -6089,mul_609,call_function,mul.Tensor,backward,7,2,2,1,5279,866,8 -6067,mul_604,call_function,mul.Tensor,backward,7,2,2,1,5265,865,8 -6072,einsum_default_482,call_function,einsum.default,backward,7,2,2,1,5268,859,5 -6094,einsum_default_484,call_function,einsum.default,backward,7,2,2,1,5283,859,5 -6095,add_283,call_function,add.Tensor,unknown,,2,2,1,5288,858,10 -6104,mul_611,call_function,mul.Tensor,backward,7,2,2,1,854,856,8 -6103,mul_610,call_function,mul.Tensor,backward,7,2,2,1,5292,850,8 -6107,mul_612,call_function,mul.Tensor,backward,7,2,2,1,5297,848,8 -6110,mul_613,call_function,mul.Tensor,backward,7,2,2,1,5300,846,8 -6111,sub_62,call_function,sub.Tensor,backward,7,2,2,1,5301,845,10 -6112,mul_614,call_function,mul.Tensor,backward,7,2,2,1,5302,844,8 -6117,add_284,call_function,add.Tensor,unknown,,2,2,1,5304,842,10 -6123,einsum_default_486,call_function,einsum.default,backward,7,2,2,1,5307,836,5 -6142,mul_616,call_function,mul.Tensor,backward,7,2,2,1,5321,802,8 -6147,mul_617,call_function,mul.Tensor,backward,7,2,2,1,5321,801,8 -6160,einsum_default_488,call_function,einsum.default,backward,7,2,2,1,5319,792,5 -6167,einsum_default_490,call_function,einsum.default,backward,7,2,2,1,5328,792,5 -6168,add_285,call_function,add.Tensor,unknown,,2,2,1,5335,791,10 -6175,einsum_default_492,call_function,einsum.default,backward,7,2,2,1,5328,791,5 -6176,add_286,call_function,add.Tensor,unknown,,2,2,1,5351,790,10 
-6185,mul_619,call_function,mul.Tensor,backward,7,2,2,1,787,788,8 -6184,mul_618,call_function,mul.Tensor,backward,7,2,2,1,5355,782,8 -6188,mul_620,call_function,mul.Tensor,backward,7,2,2,1,5360,780,8 -6191,mul_621,call_function,mul.Tensor,backward,7,2,2,1,5363,778,8 -6192,sub_63,call_function,sub.Tensor,backward,7,2,2,1,5364,777,10 -6193,mul_622,call_function,mul.Tensor,backward,7,2,2,1,5365,776,8 -6198,add_287,call_function,add.Tensor,unknown,,2,2,1,5367,774,10 -6204,einsum_default_494,call_function,einsum.default,backward,6,2,2,1,5370,768,5 -6210,mul_625,call_function,mul.Tensor,backward,6,2,2,1,5372,759,8 -6229,mul_628,call_function,mul.Tensor,backward,6,2,2,1,765,758,8 -6227,mul_627,call_function,mul.Tensor,backward,6,2,2,1,5382,757,8 -6231,mul_629,call_function,mul.Tensor,backward,6,2,2,1,5386,756,8 -6209,mul_624,call_function,mul.Tensor,backward,6,2,2,1,5372,755,8 -6214,einsum_default_496,call_function,einsum.default,backward,6,2,2,1,5375,749,5 -6236,einsum_default_498,call_function,einsum.default,backward,6,2,2,1,5390,749,5 -6237,add_290,call_function,add.Tensor,unknown,,2,2,1,5395,748,10 -6246,mul_631,call_function,mul.Tensor,backward,6,2,2,1,744,746,8 -6245,mul_630,call_function,mul.Tensor,backward,6,2,2,1,5399,740,8 -6249,mul_632,call_function,mul.Tensor,backward,6,2,2,1,5404,738,8 -6252,mul_633,call_function,mul.Tensor,backward,6,2,2,1,5407,736,8 -6253,sub_65,call_function,sub.Tensor,backward,6,2,2,1,5408,735,10 -6254,mul_634,call_function,mul.Tensor,backward,6,2,2,1,5409,734,8 -6259,add_291,call_function,add.Tensor,unknown,,2,2,1,5411,732,10 -6265,einsum_default_500,call_function,einsum.default,backward,6,2,2,1,5414,726,5 -6284,mul_636,call_function,mul.Tensor,backward,6,2,2,1,5428,692,8 -6289,mul_637,call_function,mul.Tensor,backward,6,2,2,1,5428,691,8 -6302,einsum_default_502,call_function,einsum.default,backward,6,2,2,1,5426,682,5 -6309,einsum_default_504,call_function,einsum.default,backward,6,2,2,1,5435,682,5 
-6310,add_292,call_function,add.Tensor,unknown,,2,2,1,5442,681,10 -6317,einsum_default_506,call_function,einsum.default,backward,6,2,2,1,5435,681,5 -6318,add_293,call_function,add.Tensor,unknown,,2,2,1,5458,680,10 -6327,mul_639,call_function,mul.Tensor,backward,6,2,2,1,677,678,8 -6326,mul_638,call_function,mul.Tensor,backward,6,2,2,1,5462,672,8 -6330,mul_640,call_function,mul.Tensor,backward,6,2,2,1,5467,670,8 -6333,mul_641,call_function,mul.Tensor,backward,6,2,2,1,5470,668,8 -6334,sub_66,call_function,sub.Tensor,backward,6,2,2,1,5471,667,10 -6335,mul_642,call_function,mul.Tensor,backward,6,2,2,1,5472,666,8 -6340,add_294,call_function,add.Tensor,unknown,,2,2,1,5474,664,10 -6346,einsum_default_508,call_function,einsum.default,backward,5,2,2,1,5477,658,5 -6352,mul_645,call_function,mul.Tensor,backward,5,2,2,1,5479,649,8 -6371,mul_648,call_function,mul.Tensor,backward,5,2,2,1,655,648,8 -6369,mul_647,call_function,mul.Tensor,backward,5,2,2,1,5489,647,8 -6373,mul_649,call_function,mul.Tensor,backward,5,2,2,1,5493,646,8 -6351,mul_644,call_function,mul.Tensor,backward,5,2,2,1,5479,645,8 -6356,einsum_default_510,call_function,einsum.default,backward,5,2,2,1,5482,639,5 -6378,einsum_default_512,call_function,einsum.default,backward,5,2,2,1,5497,639,5 -6379,add_297,call_function,add.Tensor,unknown,,2,2,1,5502,638,10 -6388,mul_651,call_function,mul.Tensor,backward,5,2,2,1,634,636,8 -6387,mul_650,call_function,mul.Tensor,backward,5,2,2,1,5506,630,8 -6391,mul_652,call_function,mul.Tensor,backward,5,2,2,1,5511,628,8 -6394,mul_653,call_function,mul.Tensor,backward,5,2,2,1,5514,626,8 -6395,sub_68,call_function,sub.Tensor,backward,5,2,2,1,5515,625,10 -6396,mul_654,call_function,mul.Tensor,backward,5,2,2,1,5516,624,8 -6401,add_298,call_function,add.Tensor,unknown,,2,2,1,5518,622,10 -6407,einsum_default_514,call_function,einsum.default,backward,5,2,2,1,5521,616,5 -6426,mul_656,call_function,mul.Tensor,backward,5,2,2,1,5535,582,8 
-6431,mul_657,call_function,mul.Tensor,backward,5,2,2,1,5535,581,8 -6444,einsum_default_516,call_function,einsum.default,backward,5,2,2,1,5533,572,5 -6451,einsum_default_518,call_function,einsum.default,backward,5,2,2,1,5542,572,5 -6452,add_299,call_function,add.Tensor,unknown,,2,2,1,5549,571,10 -6459,einsum_default_520,call_function,einsum.default,backward,5,2,2,1,5542,571,5 -6460,add_300,call_function,add.Tensor,unknown,,2,2,1,5565,570,10 -6469,mul_659,call_function,mul.Tensor,backward,5,2,2,1,567,568,8 -6468,mul_658,call_function,mul.Tensor,backward,5,2,2,1,5569,562,8 -6472,mul_660,call_function,mul.Tensor,backward,5,2,2,1,5574,560,8 -6475,mul_661,call_function,mul.Tensor,backward,5,2,2,1,5577,558,8 -6476,sub_69,call_function,sub.Tensor,backward,5,2,2,1,5578,557,10 -6477,mul_662,call_function,mul.Tensor,backward,5,2,2,1,5579,556,8 -6482,add_301,call_function,add.Tensor,unknown,,2,2,1,5581,554,10 -6488,einsum_default_522,call_function,einsum.default,backward,4,2,2,1,5584,548,5 -6494,mul_665,call_function,mul.Tensor,backward,4,2,2,1,5586,539,8 -6513,mul_668,call_function,mul.Tensor,backward,4,2,2,1,545,538,8 -6511,mul_667,call_function,mul.Tensor,backward,4,2,2,1,5596,537,8 -6515,mul_669,call_function,mul.Tensor,backward,4,2,2,1,5600,536,8 -6493,mul_664,call_function,mul.Tensor,backward,4,2,2,1,5586,535,8 -6498,einsum_default_524,call_function,einsum.default,backward,4,2,2,1,5589,529,5 -6520,einsum_default_526,call_function,einsum.default,backward,4,2,2,1,5604,529,5 -6521,add_304,call_function,add.Tensor,unknown,,2,2,1,5609,528,10 -6530,mul_671,call_function,mul.Tensor,backward,4,2,2,1,524,526,8 -6529,mul_670,call_function,mul.Tensor,backward,4,2,2,1,5613,520,8 -6533,mul_672,call_function,mul.Tensor,backward,4,2,2,1,5618,518,8 -6536,mul_673,call_function,mul.Tensor,backward,4,2,2,1,5621,516,8 -6537,sub_71,call_function,sub.Tensor,backward,4,2,2,1,5622,515,10 -6538,mul_674,call_function,mul.Tensor,backward,4,2,2,1,5623,514,8 
-6543,add_305,call_function,add.Tensor,unknown,,2,2,1,5625,512,10 -6549,einsum_default_528,call_function,einsum.default,backward,4,2,2,1,5628,506,5 -6568,mul_676,call_function,mul.Tensor,backward,4,2,2,1,5642,472,8 -6573,mul_677,call_function,mul.Tensor,backward,4,2,2,1,5642,471,8 -6586,einsum_default_530,call_function,einsum.default,backward,4,2,2,1,5640,462,5 -6593,einsum_default_532,call_function,einsum.default,backward,4,2,2,1,5649,462,5 -6594,add_306,call_function,add.Tensor,unknown,,2,2,1,5656,461,10 -6601,einsum_default_534,call_function,einsum.default,backward,4,2,2,1,5649,461,5 -6602,add_307,call_function,add.Tensor,unknown,,2,2,1,5672,460,10 -6611,mul_679,call_function,mul.Tensor,backward,4,2,2,1,457,458,8 -6610,mul_678,call_function,mul.Tensor,backward,4,2,2,1,5676,452,8 -6614,mul_680,call_function,mul.Tensor,backward,4,2,2,1,5681,450,8 -6617,mul_681,call_function,mul.Tensor,backward,4,2,2,1,5684,448,8 -6618,sub_72,call_function,sub.Tensor,backward,4,2,2,1,5685,447,10 -6619,mul_682,call_function,mul.Tensor,backward,4,2,2,1,5686,446,8 -6624,add_308,call_function,add.Tensor,unknown,,2,2,1,5688,444,10 -6630,einsum_default_536,call_function,einsum.default,backward,3,2,2,1,5691,438,5 -6636,mul_685,call_function,mul.Tensor,backward,3,2,2,1,5693,429,8 -6655,mul_688,call_function,mul.Tensor,backward,3,2,2,1,435,428,8 -6653,mul_687,call_function,mul.Tensor,backward,3,2,2,1,5703,427,8 -6657,mul_689,call_function,mul.Tensor,backward,3,2,2,1,5707,426,8 -6635,mul_684,call_function,mul.Tensor,backward,3,2,2,1,5693,425,8 -6640,einsum_default_538,call_function,einsum.default,backward,3,2,2,1,5696,419,5 -6662,einsum_default_540,call_function,einsum.default,backward,3,2,2,1,5711,419,5 -6663,add_311,call_function,add.Tensor,unknown,,2,2,1,5716,418,10 -6672,mul_691,call_function,mul.Tensor,backward,3,2,2,1,414,416,8 -6671,mul_690,call_function,mul.Tensor,backward,3,2,2,1,5720,410,8 -6675,mul_692,call_function,mul.Tensor,backward,3,2,2,1,5725,408,8 
-6678,mul_693,call_function,mul.Tensor,backward,3,2,2,1,5728,406,8 -6679,sub_74,call_function,sub.Tensor,backward,3,2,2,1,5729,405,10 -6680,mul_694,call_function,mul.Tensor,backward,3,2,2,1,5730,404,8 -6685,add_312,call_function,add.Tensor,unknown,,2,2,1,5732,402,10 -6691,einsum_default_542,call_function,einsum.default,backward,3,2,2,1,5735,396,5 -6710,mul_696,call_function,mul.Tensor,backward,3,2,2,1,5749,362,8 -6715,mul_697,call_function,mul.Tensor,backward,3,2,2,1,5749,361,8 -6728,einsum_default_544,call_function,einsum.default,backward,3,2,2,1,5747,352,5 -6735,einsum_default_546,call_function,einsum.default,backward,3,2,2,1,5756,352,5 -6736,add_313,call_function,add.Tensor,unknown,,2,2,1,5763,351,10 -6743,einsum_default_548,call_function,einsum.default,backward,3,2,2,1,5756,351,5 -6744,add_314,call_function,add.Tensor,unknown,,2,2,1,5779,350,10 -6753,mul_699,call_function,mul.Tensor,backward,3,2,2,1,347,348,8 -6752,mul_698,call_function,mul.Tensor,backward,3,2,2,1,5783,342,8 -6756,mul_700,call_function,mul.Tensor,backward,3,2,2,1,5788,340,8 -6759,mul_701,call_function,mul.Tensor,backward,3,2,2,1,5791,338,8 -6760,sub_75,call_function,sub.Tensor,backward,3,2,2,1,5792,337,10 -6761,mul_702,call_function,mul.Tensor,backward,3,2,2,1,5793,336,8 -6766,add_315,call_function,add.Tensor,unknown,,2,2,1,5795,334,10 -6772,einsum_default_550,call_function,einsum.default,backward,2,2,2,1,5798,328,5 -6778,mul_705,call_function,mul.Tensor,backward,2,2,2,1,5800,319,8 -6797,mul_708,call_function,mul.Tensor,backward,2,2,2,1,325,318,8 -6795,mul_707,call_function,mul.Tensor,backward,2,2,2,1,5810,317,8 -6799,mul_709,call_function,mul.Tensor,backward,2,2,2,1,5814,316,8 -6777,mul_704,call_function,mul.Tensor,backward,2,2,2,1,5800,315,8 -6782,einsum_default_552,call_function,einsum.default,backward,2,2,2,1,5803,309,5 -6804,einsum_default_554,call_function,einsum.default,backward,2,2,2,1,5818,309,5 -6805,add_318,call_function,add.Tensor,unknown,,2,2,1,5823,308,10 
-6814,mul_711,call_function,mul.Tensor,backward,2,2,2,1,304,306,8 -6813,mul_710,call_function,mul.Tensor,backward,2,2,2,1,5827,300,8 -6817,mul_712,call_function,mul.Tensor,backward,2,2,2,1,5832,298,8 -6820,mul_713,call_function,mul.Tensor,backward,2,2,2,1,5835,296,8 -6821,sub_77,call_function,sub.Tensor,backward,2,2,2,1,5836,295,10 -6822,mul_714,call_function,mul.Tensor,backward,2,2,2,1,5837,294,8 -6827,add_319,call_function,add.Tensor,unknown,,2,2,1,5839,292,10 -6833,einsum_default_556,call_function,einsum.default,backward,2,2,2,1,5842,286,5 -6852,mul_716,call_function,mul.Tensor,backward,2,2,2,1,5856,252,8 -6857,mul_717,call_function,mul.Tensor,backward,2,2,2,1,5856,251,8 -6870,einsum_default_558,call_function,einsum.default,backward,2,2,2,1,5854,242,5 -6877,einsum_default_560,call_function,einsum.default,backward,2,2,2,1,5863,242,5 -6878,add_320,call_function,add.Tensor,unknown,,2,2,1,5870,241,10 -6885,einsum_default_562,call_function,einsum.default,backward,2,2,2,1,5863,241,5 -6886,add_321,call_function,add.Tensor,unknown,,2,2,1,5886,240,10 -6895,mul_719,call_function,mul.Tensor,backward,2,2,2,1,237,238,8 -6894,mul_718,call_function,mul.Tensor,backward,2,2,2,1,5890,232,8 -6898,mul_720,call_function,mul.Tensor,backward,2,2,2,1,5895,230,8 -6901,mul_721,call_function,mul.Tensor,backward,2,2,2,1,5898,228,8 -6902,sub_78,call_function,sub.Tensor,backward,2,2,2,1,5899,227,10 -6903,mul_722,call_function,mul.Tensor,backward,2,2,2,1,5900,226,8 -6908,add_322,call_function,add.Tensor,unknown,,2,2,1,5902,224,10 -6914,einsum_default_564,call_function,einsum.default,backward,1,2,2,1,5905,218,5 -6920,mul_725,call_function,mul.Tensor,backward,1,2,2,1,5907,209,8 -6939,mul_728,call_function,mul.Tensor,backward,1,2,2,1,215,208,8 -6937,mul_727,call_function,mul.Tensor,backward,1,2,2,1,5917,207,8 -6941,mul_729,call_function,mul.Tensor,backward,1,2,2,1,5921,206,8 -6919,mul_724,call_function,mul.Tensor,backward,1,2,2,1,5907,205,8 
-6924,einsum_default_566,call_function,einsum.default,backward,1,2,2,1,5910,199,5 -6946,einsum_default_568,call_function,einsum.default,backward,1,2,2,1,5925,199,5 -6947,add_325,call_function,add.Tensor,unknown,,2,2,1,5930,198,10 -6956,mul_731,call_function,mul.Tensor,backward,1,2,2,1,194,196,8 -6955,mul_730,call_function,mul.Tensor,backward,1,2,2,1,5934,190,8 -6959,mul_732,call_function,mul.Tensor,backward,1,2,2,1,5939,188,8 -6962,mul_733,call_function,mul.Tensor,backward,1,2,2,1,5942,186,8 -6963,sub_80,call_function,sub.Tensor,backward,1,2,2,1,5943,185,10 -6964,mul_734,call_function,mul.Tensor,backward,1,2,2,1,5944,184,8 -6969,add_326,call_function,add.Tensor,unknown,,2,2,1,5946,182,10 -6975,einsum_default_570,call_function,einsum.default,backward,1,2,2,1,5949,176,5 -6994,mul_736,call_function,mul.Tensor,backward,1,2,2,1,5963,142,8 -6999,mul_737,call_function,mul.Tensor,backward,1,2,2,1,5963,141,8 -7012,einsum_default_572,call_function,einsum.default,backward,1,2,2,1,5961,132,5 -7019,einsum_default_574,call_function,einsum.default,backward,1,2,2,1,5970,132,5 -7020,add_327,call_function,add.Tensor,unknown,,2,2,1,5977,131,10 -7027,einsum_default_576,call_function,einsum.default,backward,1,2,2,1,5970,131,5 -7028,add_328,call_function,add.Tensor,unknown,,2,2,1,5993,130,10 -7037,mul_739,call_function,mul.Tensor,backward,1,2,2,1,127,128,8 -7036,mul_738,call_function,mul.Tensor,backward,1,2,2,1,5997,122,8 -7040,mul_740,call_function,mul.Tensor,backward,1,2,2,1,6002,120,8 -7043,mul_741,call_function,mul.Tensor,backward,1,2,2,1,6005,118,8 -7044,sub_81,call_function,sub.Tensor,backward,1,2,2,1,6006,117,10 -7045,mul_742,call_function,mul.Tensor,backward,1,2,2,1,6007,116,8 -7050,add_329,call_function,add.Tensor,unknown,,2,2,1,6009,114,10 -7056,einsum_default_578,call_function,einsum.default,backward,0,2,2,1,6012,108,5 -7062,mul_745,call_function,mul.Tensor,backward,0,2,2,1,6014,99,8 -7081,mul_748,call_function,mul.Tensor,backward,0,2,2,1,105,98,8 
-7079,mul_747,call_function,mul.Tensor,backward,0,2,2,1,6024,97,8 -7083,mul_749,call_function,mul.Tensor,backward,0,2,2,1,6028,96,8 -7061,mul_744,call_function,mul.Tensor,backward,0,2,2,1,6014,95,8 -7066,einsum_default_580,call_function,einsum.default,backward,0,2,2,1,6017,89,5 -7088,einsum_default_582,call_function,einsum.default,backward,0,2,2,1,6032,89,5 -7089,add_332,call_function,add.Tensor,unknown,,2,2,1,6037,88,10 -7098,mul_751,call_function,mul.Tensor,backward,0,2,2,1,84,86,8 -7097,mul_750,call_function,mul.Tensor,backward,0,2,2,1,6041,80,8 -7101,mul_752,call_function,mul.Tensor,backward,0,2,2,1,6046,78,8 -7104,mul_753,call_function,mul.Tensor,backward,0,2,2,1,6049,76,8 -7105,sub_83,call_function,sub.Tensor,backward,0,2,2,1,6050,75,10 -7106,mul_754,call_function,mul.Tensor,backward,0,2,2,1,6051,74,8 -7111,add_333,call_function,add.Tensor,unknown,,2,2,1,6053,72,10 -7117,einsum_default_584,call_function,einsum.default,backward,0,2,2,1,6056,66,5 -7136,mul_756,call_function,mul.Tensor,backward,0,2,2,1,6070,32,8 -7141,mul_757,call_function,mul.Tensor,backward,0,2,2,1,6070,31,8 -7154,einsum_default_586,call_function,einsum.default,backward,0,2,2,1,6068,22,5 -7161,einsum_default_588,call_function,einsum.default,backward,0,2,2,1,6077,22,5 -7162,add_334,call_function,add.Tensor,unknown,,2,2,1,6084,21,10 -7169,einsum_default_590,call_function,einsum.default,backward,0,2,2,1,6077,21,5 -7170,add_335,call_function,add.Tensor,unknown,,2,2,1,6100,20,10 -7179,mul_759,call_function,mul.Tensor,backward,0,2,2,1,15,18,8 -7178,mul_758,call_function,mul.Tensor,backward,0,2,2,1,6104,12,8 -3183,mul_196,call_function,mul.Tensor,forward,,2,2,1,3096,10,8 -7182,mul_760,call_function,mul.Tensor,backward,0,2,2,1,6109,10,8 -3185,mul_197,call_function,mul.Tensor,forward,,2,2,1,3100,9,8 -7185,mul_761,call_function,mul.Tensor,backward,0,2,2,1,6112,8,8 -7186,sub_84,call_function,sub.Tensor,backward,0,2,2,1,6113,7,10 -7187,mul_762,call_function,mul.Tensor,backward,0,2,2,1,6114,6,8 
-3194,einsum_default_197,call_function,einsum.default,backward,,2,2,1,3105,4,5 -3213,mul_203,call_function,mul.Tensor,backward,,2,2,1,3108,4,8 -3273,mul_215,call_function,mul.Tensor,backward,27,2,2,1,3154,4,8 -3354,mul_223,call_function,mul.Tensor,backward,27,2,2,1,3217,4,8 -3415,mul_235,call_function,mul.Tensor,backward,26,2,2,1,3261,4,8 -3496,mul_243,call_function,mul.Tensor,backward,26,2,2,1,3324,4,8 -3557,mul_255,call_function,mul.Tensor,backward,25,2,2,1,3368,4,8 -3638,mul_263,call_function,mul.Tensor,backward,25,2,2,1,3431,4,8 -3699,mul_275,call_function,mul.Tensor,backward,24,2,2,1,3475,4,8 -3780,mul_283,call_function,mul.Tensor,backward,24,2,2,1,3538,4,8 -3841,mul_295,call_function,mul.Tensor,backward,23,2,2,1,3582,4,8 -3922,mul_303,call_function,mul.Tensor,backward,23,2,2,1,3645,4,8 -3983,mul_315,call_function,mul.Tensor,backward,22,2,2,1,3689,4,8 -4064,mul_323,call_function,mul.Tensor,backward,22,2,2,1,3752,4,8 -4125,mul_335,call_function,mul.Tensor,backward,21,2,2,1,3796,4,8 -4206,mul_343,call_function,mul.Tensor,backward,21,2,2,1,3859,4,8 -4267,mul_355,call_function,mul.Tensor,backward,20,2,2,1,3903,4,8 -4348,mul_363,call_function,mul.Tensor,backward,20,2,2,1,3966,4,8 -4409,mul_375,call_function,mul.Tensor,backward,19,2,2,1,4010,4,8 -4490,mul_383,call_function,mul.Tensor,backward,19,2,2,1,4073,4,8 -4551,mul_395,call_function,mul.Tensor,backward,18,2,2,1,4117,4,8 -4632,mul_403,call_function,mul.Tensor,backward,18,2,2,1,4180,4,8 -4693,mul_415,call_function,mul.Tensor,backward,17,2,2,1,4224,4,8 -4774,mul_423,call_function,mul.Tensor,backward,17,2,2,1,4287,4,8 -4835,mul_435,call_function,mul.Tensor,backward,16,2,2,1,4331,4,8 -4916,mul_443,call_function,mul.Tensor,backward,16,2,2,1,4394,4,8 -4977,mul_455,call_function,mul.Tensor,backward,15,2,2,1,4438,4,8 -5058,mul_463,call_function,mul.Tensor,backward,15,2,2,1,4501,4,8 -5119,mul_475,call_function,mul.Tensor,backward,14,2,2,1,4545,4,8 -5200,mul_483,call_function,mul.Tensor,backward,14,2,2,1,4608,4,8 
-5261,mul_495,call_function,mul.Tensor,backward,13,2,2,1,4652,4,8 -5342,mul_503,call_function,mul.Tensor,backward,13,2,2,1,4715,4,8 -5403,mul_515,call_function,mul.Tensor,backward,12,2,2,1,4759,4,8 -5484,mul_523,call_function,mul.Tensor,backward,12,2,2,1,4822,4,8 -5545,mul_535,call_function,mul.Tensor,backward,11,2,2,1,4866,4,8 -5626,mul_543,call_function,mul.Tensor,backward,11,2,2,1,4929,4,8 -5687,mul_555,call_function,mul.Tensor,backward,10,2,2,1,4973,4,8 -5768,mul_563,call_function,mul.Tensor,backward,10,2,2,1,5036,4,8 -5829,mul_575,call_function,mul.Tensor,backward,9,2,2,1,5080,4,8 -5910,mul_583,call_function,mul.Tensor,backward,9,2,2,1,5143,4,8 -5971,mul_595,call_function,mul.Tensor,backward,8,2,2,1,5187,4,8 -6052,mul_603,call_function,mul.Tensor,backward,8,2,2,1,5250,4,8 -6113,mul_615,call_function,mul.Tensor,backward,7,2,2,1,5294,4,8 -6194,mul_623,call_function,mul.Tensor,backward,7,2,2,1,5357,4,8 -6255,mul_635,call_function,mul.Tensor,backward,6,2,2,1,5401,4,8 -6336,mul_643,call_function,mul.Tensor,backward,6,2,2,1,5464,4,8 -6397,mul_655,call_function,mul.Tensor,backward,5,2,2,1,5508,4,8 -6478,mul_663,call_function,mul.Tensor,backward,5,2,2,1,5571,4,8 -6539,mul_675,call_function,mul.Tensor,backward,4,2,2,1,5615,4,8 -6620,mul_683,call_function,mul.Tensor,backward,4,2,2,1,5678,4,8 -6681,mul_695,call_function,mul.Tensor,backward,3,2,2,1,5722,4,8 -6762,mul_703,call_function,mul.Tensor,backward,3,2,2,1,5785,4,8 -6823,mul_715,call_function,mul.Tensor,backward,2,2,2,1,5829,4,8 -6904,mul_723,call_function,mul.Tensor,backward,2,2,2,1,5892,4,8 -6965,mul_735,call_function,mul.Tensor,backward,1,2,2,1,5936,4,8 -7046,mul_743,call_function,mul.Tensor,backward,1,2,2,1,5999,4,8 -7107,mul_755,call_function,mul.Tensor,backward,0,2,2,1,6043,4,8 -7188,mul_763,call_function,mul.Tensor,backward,0,2,2,1,6106,4,8 -7192,add_336,call_function,add.Tensor,unknown,,2,2,1,6116,4,10 -3220,einsum_default_199,call_function,einsum.default,backward,27,2,2,1,3122,3,5 
-3230,einsum_default_201,call_function,einsum.default,backward,27,2,2,1,3127,3,5 -3252,einsum_default_203,call_function,einsum.default,backward,27,2,2,1,3142,3,5 -3281,einsum_default_205,call_function,einsum.default,backward,27,2,2,1,3166,3,5 -3318,einsum_default_207,call_function,einsum.default,backward,27,2,2,1,3178,3,5 -3325,einsum_default_209,call_function,einsum.default,backward,27,2,2,1,3187,3,5 -3333,einsum_default_211,call_function,einsum.default,backward,27,2,2,1,3187,3,5 -3362,einsum_default_213,call_function,einsum.default,backward,26,2,2,1,3229,3,5 -3372,einsum_default_215,call_function,einsum.default,backward,26,2,2,1,3234,3,5 -3394,einsum_default_217,call_function,einsum.default,backward,26,2,2,1,3249,3,5 -3423,einsum_default_219,call_function,einsum.default,backward,26,2,2,1,3273,3,5 -3460,einsum_default_221,call_function,einsum.default,backward,26,2,2,1,3285,3,5 -3467,einsum_default_223,call_function,einsum.default,backward,26,2,2,1,3294,3,5 -3475,einsum_default_225,call_function,einsum.default,backward,26,2,2,1,3294,3,5 -3504,einsum_default_227,call_function,einsum.default,backward,25,2,2,1,3336,3,5 -3514,einsum_default_229,call_function,einsum.default,backward,25,2,2,1,3341,3,5 -3536,einsum_default_231,call_function,einsum.default,backward,25,2,2,1,3356,3,5 -3565,einsum_default_233,call_function,einsum.default,backward,25,2,2,1,3380,3,5 -3602,einsum_default_235,call_function,einsum.default,backward,25,2,2,1,3392,3,5 -3609,einsum_default_237,call_function,einsum.default,backward,25,2,2,1,3401,3,5 -3617,einsum_default_239,call_function,einsum.default,backward,25,2,2,1,3401,3,5 -3646,einsum_default_241,call_function,einsum.default,backward,24,2,2,1,3443,3,5 -3656,einsum_default_243,call_function,einsum.default,backward,24,2,2,1,3448,3,5 -3678,einsum_default_245,call_function,einsum.default,backward,24,2,2,1,3463,3,5 -3707,einsum_default_247,call_function,einsum.default,backward,24,2,2,1,3487,3,5 
-3744,einsum_default_249,call_function,einsum.default,backward,24,2,2,1,3499,3,5 -3751,einsum_default_251,call_function,einsum.default,backward,24,2,2,1,3508,3,5 -3759,einsum_default_253,call_function,einsum.default,backward,24,2,2,1,3508,3,5 -3788,einsum_default_255,call_function,einsum.default,backward,23,2,2,1,3550,3,5 -3798,einsum_default_257,call_function,einsum.default,backward,23,2,2,1,3555,3,5 -3820,einsum_default_259,call_function,einsum.default,backward,23,2,2,1,3570,3,5 -3849,einsum_default_261,call_function,einsum.default,backward,23,2,2,1,3594,3,5 -3886,einsum_default_263,call_function,einsum.default,backward,23,2,2,1,3606,3,5 -3893,einsum_default_265,call_function,einsum.default,backward,23,2,2,1,3615,3,5 -3901,einsum_default_267,call_function,einsum.default,backward,23,2,2,1,3615,3,5 -3930,einsum_default_269,call_function,einsum.default,backward,22,2,2,1,3657,3,5 -3940,einsum_default_271,call_function,einsum.default,backward,22,2,2,1,3662,3,5 -3962,einsum_default_273,call_function,einsum.default,backward,22,2,2,1,3677,3,5 -3991,einsum_default_275,call_function,einsum.default,backward,22,2,2,1,3701,3,5 -4028,einsum_default_277,call_function,einsum.default,backward,22,2,2,1,3713,3,5 -4035,einsum_default_279,call_function,einsum.default,backward,22,2,2,1,3722,3,5 -4043,einsum_default_281,call_function,einsum.default,backward,22,2,2,1,3722,3,5 -4072,einsum_default_283,call_function,einsum.default,backward,21,2,2,1,3764,3,5 -4082,einsum_default_285,call_function,einsum.default,backward,21,2,2,1,3769,3,5 -4104,einsum_default_287,call_function,einsum.default,backward,21,2,2,1,3784,3,5 -4133,einsum_default_289,call_function,einsum.default,backward,21,2,2,1,3808,3,5 -4170,einsum_default_291,call_function,einsum.default,backward,21,2,2,1,3820,3,5 -4177,einsum_default_293,call_function,einsum.default,backward,21,2,2,1,3829,3,5 -4185,einsum_default_295,call_function,einsum.default,backward,21,2,2,1,3829,3,5 
-4214,einsum_default_297,call_function,einsum.default,backward,20,2,2,1,3871,3,5 -4224,einsum_default_299,call_function,einsum.default,backward,20,2,2,1,3876,3,5 -4246,einsum_default_301,call_function,einsum.default,backward,20,2,2,1,3891,3,5 -4275,einsum_default_303,call_function,einsum.default,backward,20,2,2,1,3915,3,5 -4312,einsum_default_305,call_function,einsum.default,backward,20,2,2,1,3927,3,5 -4319,einsum_default_307,call_function,einsum.default,backward,20,2,2,1,3936,3,5 -4327,einsum_default_309,call_function,einsum.default,backward,20,2,2,1,3936,3,5 -4356,einsum_default_311,call_function,einsum.default,backward,19,2,2,1,3978,3,5 -4366,einsum_default_313,call_function,einsum.default,backward,19,2,2,1,3983,3,5 -4388,einsum_default_315,call_function,einsum.default,backward,19,2,2,1,3998,3,5 -4417,einsum_default_317,call_function,einsum.default,backward,19,2,2,1,4022,3,5 -4454,einsum_default_319,call_function,einsum.default,backward,19,2,2,1,4034,3,5 -4461,einsum_default_321,call_function,einsum.default,backward,19,2,2,1,4043,3,5 -4469,einsum_default_323,call_function,einsum.default,backward,19,2,2,1,4043,3,5 -4498,einsum_default_325,call_function,einsum.default,backward,18,2,2,1,4085,3,5 -4508,einsum_default_327,call_function,einsum.default,backward,18,2,2,1,4090,3,5 -4530,einsum_default_329,call_function,einsum.default,backward,18,2,2,1,4105,3,5 -4559,einsum_default_331,call_function,einsum.default,backward,18,2,2,1,4129,3,5 -4596,einsum_default_333,call_function,einsum.default,backward,18,2,2,1,4141,3,5 -4603,einsum_default_335,call_function,einsum.default,backward,18,2,2,1,4150,3,5 -4611,einsum_default_337,call_function,einsum.default,backward,18,2,2,1,4150,3,5 -4640,einsum_default_339,call_function,einsum.default,backward,17,2,2,1,4192,3,5 -4650,einsum_default_341,call_function,einsum.default,backward,17,2,2,1,4197,3,5 -4672,einsum_default_343,call_function,einsum.default,backward,17,2,2,1,4212,3,5 
-4701,einsum_default_345,call_function,einsum.default,backward,17,2,2,1,4236,3,5 -4738,einsum_default_347,call_function,einsum.default,backward,17,2,2,1,4248,3,5 -4745,einsum_default_349,call_function,einsum.default,backward,17,2,2,1,4257,3,5 -4753,einsum_default_351,call_function,einsum.default,backward,17,2,2,1,4257,3,5 -4782,einsum_default_353,call_function,einsum.default,backward,16,2,2,1,4299,3,5 -4792,einsum_default_355,call_function,einsum.default,backward,16,2,2,1,4304,3,5 -4814,einsum_default_357,call_function,einsum.default,backward,16,2,2,1,4319,3,5 -4843,einsum_default_359,call_function,einsum.default,backward,16,2,2,1,4343,3,5 -4880,einsum_default_361,call_function,einsum.default,backward,16,2,2,1,4355,3,5 -4887,einsum_default_363,call_function,einsum.default,backward,16,2,2,1,4364,3,5 -4895,einsum_default_365,call_function,einsum.default,backward,16,2,2,1,4364,3,5 -4924,einsum_default_367,call_function,einsum.default,backward,15,2,2,1,4406,3,5 -4934,einsum_default_369,call_function,einsum.default,backward,15,2,2,1,4411,3,5 -4956,einsum_default_371,call_function,einsum.default,backward,15,2,2,1,4426,3,5 -4985,einsum_default_373,call_function,einsum.default,backward,15,2,2,1,4450,3,5 -5022,einsum_default_375,call_function,einsum.default,backward,15,2,2,1,4462,3,5 -5029,einsum_default_377,call_function,einsum.default,backward,15,2,2,1,4471,3,5 -5037,einsum_default_379,call_function,einsum.default,backward,15,2,2,1,4471,3,5 -5066,einsum_default_381,call_function,einsum.default,backward,14,2,2,1,4513,3,5 -5076,einsum_default_383,call_function,einsum.default,backward,14,2,2,1,4518,3,5 -5098,einsum_default_385,call_function,einsum.default,backward,14,2,2,1,4533,3,5 -5127,einsum_default_387,call_function,einsum.default,backward,14,2,2,1,4557,3,5 -5164,einsum_default_389,call_function,einsum.default,backward,14,2,2,1,4569,3,5 -5171,einsum_default_391,call_function,einsum.default,backward,14,2,2,1,4578,3,5 
-5179,einsum_default_393,call_function,einsum.default,backward,14,2,2,1,4578,3,5 -5208,einsum_default_395,call_function,einsum.default,backward,13,2,2,1,4620,3,5 -5218,einsum_default_397,call_function,einsum.default,backward,13,2,2,1,4625,3,5 -5240,einsum_default_399,call_function,einsum.default,backward,13,2,2,1,4640,3,5 -5269,einsum_default_401,call_function,einsum.default,backward,13,2,2,1,4664,3,5 -5306,einsum_default_403,call_function,einsum.default,backward,13,2,2,1,4676,3,5 -5313,einsum_default_405,call_function,einsum.default,backward,13,2,2,1,4685,3,5 -5321,einsum_default_407,call_function,einsum.default,backward,13,2,2,1,4685,3,5 -5350,einsum_default_409,call_function,einsum.default,backward,12,2,2,1,4727,3,5 -5360,einsum_default_411,call_function,einsum.default,backward,12,2,2,1,4732,3,5 -5382,einsum_default_413,call_function,einsum.default,backward,12,2,2,1,4747,3,5 -5411,einsum_default_415,call_function,einsum.default,backward,12,2,2,1,4771,3,5 -5448,einsum_default_417,call_function,einsum.default,backward,12,2,2,1,4783,3,5 -5455,einsum_default_419,call_function,einsum.default,backward,12,2,2,1,4792,3,5 -5463,einsum_default_421,call_function,einsum.default,backward,12,2,2,1,4792,3,5 -5492,einsum_default_423,call_function,einsum.default,backward,11,2,2,1,4834,3,5 -5502,einsum_default_425,call_function,einsum.default,backward,11,2,2,1,4839,3,5 -5524,einsum_default_427,call_function,einsum.default,backward,11,2,2,1,4854,3,5 -5553,einsum_default_429,call_function,einsum.default,backward,11,2,2,1,4878,3,5 -5590,einsum_default_431,call_function,einsum.default,backward,11,2,2,1,4890,3,5 -5597,einsum_default_433,call_function,einsum.default,backward,11,2,2,1,4899,3,5 -5605,einsum_default_435,call_function,einsum.default,backward,11,2,2,1,4899,3,5 -5634,einsum_default_437,call_function,einsum.default,backward,10,2,2,1,4941,3,5 -5644,einsum_default_439,call_function,einsum.default,backward,10,2,2,1,4946,3,5 
-5666,einsum_default_441,call_function,einsum.default,backward,10,2,2,1,4961,3,5 -5695,einsum_default_443,call_function,einsum.default,backward,10,2,2,1,4985,3,5 -5732,einsum_default_445,call_function,einsum.default,backward,10,2,2,1,4997,3,5 -5739,einsum_default_447,call_function,einsum.default,backward,10,2,2,1,5006,3,5 -5747,einsum_default_449,call_function,einsum.default,backward,10,2,2,1,5006,3,5 -5776,einsum_default_451,call_function,einsum.default,backward,9,2,2,1,5048,3,5 -5786,einsum_default_453,call_function,einsum.default,backward,9,2,2,1,5053,3,5 -5808,einsum_default_455,call_function,einsum.default,backward,9,2,2,1,5068,3,5 -5837,einsum_default_457,call_function,einsum.default,backward,9,2,2,1,5092,3,5 -5874,einsum_default_459,call_function,einsum.default,backward,9,2,2,1,5104,3,5 -5881,einsum_default_461,call_function,einsum.default,backward,9,2,2,1,5113,3,5 -5889,einsum_default_463,call_function,einsum.default,backward,9,2,2,1,5113,3,5 -5918,einsum_default_465,call_function,einsum.default,backward,8,2,2,1,5155,3,5 -5928,einsum_default_467,call_function,einsum.default,backward,8,2,2,1,5160,3,5 -5950,einsum_default_469,call_function,einsum.default,backward,8,2,2,1,5175,3,5 -5979,einsum_default_471,call_function,einsum.default,backward,8,2,2,1,5199,3,5 -6016,einsum_default_473,call_function,einsum.default,backward,8,2,2,1,5211,3,5 -6023,einsum_default_475,call_function,einsum.default,backward,8,2,2,1,5220,3,5 -6031,einsum_default_477,call_function,einsum.default,backward,8,2,2,1,5220,3,5 -6060,einsum_default_479,call_function,einsum.default,backward,7,2,2,1,5262,3,5 -6070,einsum_default_481,call_function,einsum.default,backward,7,2,2,1,5267,3,5 -6092,einsum_default_483,call_function,einsum.default,backward,7,2,2,1,5282,3,5 -6121,einsum_default_485,call_function,einsum.default,backward,7,2,2,1,5306,3,5 -6158,einsum_default_487,call_function,einsum.default,backward,7,2,2,1,5318,3,5 
-6165,einsum_default_489,call_function,einsum.default,backward,7,2,2,1,5327,3,5 -6173,einsum_default_491,call_function,einsum.default,backward,7,2,2,1,5327,3,5 -6202,einsum_default_493,call_function,einsum.default,backward,6,2,2,1,5369,3,5 -6212,einsum_default_495,call_function,einsum.default,backward,6,2,2,1,5374,3,5 -6234,einsum_default_497,call_function,einsum.default,backward,6,2,2,1,5389,3,5 -6263,einsum_default_499,call_function,einsum.default,backward,6,2,2,1,5413,3,5 -6300,einsum_default_501,call_function,einsum.default,backward,6,2,2,1,5425,3,5 -6307,einsum_default_503,call_function,einsum.default,backward,6,2,2,1,5434,3,5 -6315,einsum_default_505,call_function,einsum.default,backward,6,2,2,1,5434,3,5 -6344,einsum_default_507,call_function,einsum.default,backward,5,2,2,1,5476,3,5 -6354,einsum_default_509,call_function,einsum.default,backward,5,2,2,1,5481,3,5 -6376,einsum_default_511,call_function,einsum.default,backward,5,2,2,1,5496,3,5 -6405,einsum_default_513,call_function,einsum.default,backward,5,2,2,1,5520,3,5 -6442,einsum_default_515,call_function,einsum.default,backward,5,2,2,1,5532,3,5 -6449,einsum_default_517,call_function,einsum.default,backward,5,2,2,1,5541,3,5 -6457,einsum_default_519,call_function,einsum.default,backward,5,2,2,1,5541,3,5 -6486,einsum_default_521,call_function,einsum.default,backward,4,2,2,1,5583,3,5 -6496,einsum_default_523,call_function,einsum.default,backward,4,2,2,1,5588,3,5 -6518,einsum_default_525,call_function,einsum.default,backward,4,2,2,1,5603,3,5 -6547,einsum_default_527,call_function,einsum.default,backward,4,2,2,1,5627,3,5 -6584,einsum_default_529,call_function,einsum.default,backward,4,2,2,1,5639,3,5 -6591,einsum_default_531,call_function,einsum.default,backward,4,2,2,1,5648,3,5 -6599,einsum_default_533,call_function,einsum.default,backward,4,2,2,1,5648,3,5 -6628,einsum_default_535,call_function,einsum.default,backward,3,2,2,1,5690,3,5 -6638,einsum_default_537,call_function,einsum.default,backward,3,2,2,1,5695,3,5 
-6660,einsum_default_539,call_function,einsum.default,backward,3,2,2,1,5710,3,5 -6689,einsum_default_541,call_function,einsum.default,backward,3,2,2,1,5734,3,5 -6726,einsum_default_543,call_function,einsum.default,backward,3,2,2,1,5746,3,5 -6733,einsum_default_545,call_function,einsum.default,backward,3,2,2,1,5755,3,5 -6741,einsum_default_547,call_function,einsum.default,backward,3,2,2,1,5755,3,5 -6770,einsum_default_549,call_function,einsum.default,backward,2,2,2,1,5797,3,5 -6780,einsum_default_551,call_function,einsum.default,backward,2,2,2,1,5802,3,5 -6802,einsum_default_553,call_function,einsum.default,backward,2,2,2,1,5817,3,5 -6831,einsum_default_555,call_function,einsum.default,backward,2,2,2,1,5841,3,5 -6868,einsum_default_557,call_function,einsum.default,backward,2,2,2,1,5853,3,5 -6875,einsum_default_559,call_function,einsum.default,backward,2,2,2,1,5862,3,5 -6883,einsum_default_561,call_function,einsum.default,backward,2,2,2,1,5862,3,5 -6912,einsum_default_563,call_function,einsum.default,backward,1,2,2,1,5904,3,5 -6922,einsum_default_565,call_function,einsum.default,backward,1,2,2,1,5909,3,5 -6944,einsum_default_567,call_function,einsum.default,backward,1,2,2,1,5924,3,5 -6973,einsum_default_569,call_function,einsum.default,backward,1,2,2,1,5948,3,5 -7010,einsum_default_571,call_function,einsum.default,backward,1,2,2,1,5960,3,5 -7017,einsum_default_573,call_function,einsum.default,backward,1,2,2,1,5969,3,5 -7025,einsum_default_575,call_function,einsum.default,backward,1,2,2,1,5969,3,5 -7054,einsum_default_577,call_function,einsum.default,backward,0,2,2,1,6011,3,5 -7064,einsum_default_579,call_function,einsum.default,backward,0,2,2,1,6016,3,5 -7086,einsum_default_581,call_function,einsum.default,backward,0,2,2,1,6031,3,5 -7115,einsum_default_583,call_function,einsum.default,backward,0,2,2,1,6055,3,5 -7152,einsum_default_585,call_function,einsum.default,backward,0,2,2,1,6067,3,5 -7159,einsum_default_587,call_function,einsum.default,backward,0,2,2,1,6076,3,5 
-7167,einsum_default_589,call_function,einsum.default,backward,0,2,2,1,6076,3,5 -7195,embedding_dense_backward,call_function,embedding_dense_backward.default,backward,,2,2,1,6117,3,5 -3191,einsum_default_196,call_function,einsum.default,forward,,2,2,1,3106,1,5 -7197,add_337,call_function,add.Tensor,unknown,,2,2,1,6126,1,9 diff --git a/profile_results/real_llama3_by_mesh_dim.svg b/profile_results/real_llama3_by_mesh_dim.svg deleted file mode 100644 index 6eb41508..00000000 --- a/profile_results/real_llama3_by_mesh_dim.svg +++ /dev/null @@ -1,167 +0,0 @@ - - -Real Llama3 optimizer profile vs mesh dimension -Y axes are log scale. Missing series points timed out or were not run. - -model_key=1B - -model_key=3B - -model_key=405B - -model_key=70B - -model_key=8B -strategy enum (s) - - - -9.9 -0.54 -1 -2 - - - - - - - - - - - -cost estimation (s) - - - -5 -0.54 -1 -2 - - - - - - - - - - - -ILP construction (s) - - - -14 -0.26 -1 -2 - - - - - - - - - - - -objective build (s) - - - -3.3 -0.053 -1 -2 - - - - - - - - - - - -solve (s) - - - -86 -0.49 -1 -2 - - - - - - - - - - - -pipeline total (s) - - - -124 -3 -1 -2 - - - - - - - - - - - -unique ILP vars - - - -488.5K -13.0K -1 -2 - - - - - - - - - - - -constraints - - - -177.2K -7.0K -1 -2 - - - - - - - - - - - - \ No newline at end of file diff --git a/profile_results/real_llama3_by_model_size.svg b/profile_results/real_llama3_by_model_size.svg deleted file mode 100644 index 11fabae2..00000000 --- a/profile_results/real_llama3_by_model_size.svg +++ /dev/null @@ -1,177 +0,0 @@ - - -Real Llama3 optimizer profile vs model size -Y axes are log scale. Missing series points timed out or were not run. 
- -mesh_ndim=1 - -mesh_ndim=2 -strategy enum (s) - - - -9.9 -0.54 -1.2 -3.2 -8 -71 -406 - - - - - - - - - - -cost estimation (s) - - - -5 -0.54 -1.2 -3.2 -8 -71 -406 - - - - - - - - - - -ILP construction (s) - - - -14 -0.26 -1.2 -3.2 -8 -71 -406 - - - - - - - - - - -objective build (s) - - - -3.3 -0.053 -1.2 -3.2 -8 -71 -406 - - - - - - - - - - -solve (s) - - - -86 -0.49 -1.2 -3.2 -8 -71 -406 - - - - - - - - - - -pipeline total (s) - - - -124 -3 -1.2 -3.2 -8 -71 -406 - - - - - - - - - - -unique ILP vars - - - -488.5K -13.0K -1.2 -3.2 -8 -71 -406 - - - - - - - - - - -constraints - - - -177.2K -7.0K -1.2 -3.2 -8 -71 -406 - - - - - - - - - - - \ No newline at end of file diff --git a/profile_results/real_llama3_dag_analysis.py b/profile_results/real_llama3_dag_analysis.py deleted file mode 100644 index 03b445a3..00000000 --- a/profile_results/real_llama3_dag_analysis.py +++ /dev/null @@ -1,255 +0,0 @@ -import csv -import json -import logging -import re -import sys -import time -from collections import Counter, defaultdict -from pathlib import Path - -import networkx as nx -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -sys.path.insert(0, "/home/wangkj/workspace/torchtitan") - -from torchtitan.models.llama3 import llama3_configs # noqa: E402 - -from autoparallel.api import AutoParallel -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - - -WORLD_SIZE = 64 -SEQ_LEN = 256 -GLOBAL_BATCH = 64 - - -def init_dist(): - if not torch.distributed.is_initialized(): - torch.distributed.init_process_group( - "fake", store=FakeStore(), rank=0, world_size=WORLD_SIZE - ) - - -def target_name(node): - target = node.target - if hasattr(target, "__name__"): - return target.__name__ - return str(target) - - -def 
layer_id(node): - stacks = [] - for key in ("nn_module_stack", "fwd_nn_module_stack"): - value = node.meta.get(key) - if value: - stacks.append(str(value)) - text = " ".join(stacks) - match = re.search(r"layers[._']+([0-9]+)", text) - return int(match.group(1)) if match else "" - - -def phase(node): - if "fwd_nn_module_stack" in node.meta: - return "backward" - if "nn_module_stack" in node.meta: - return "forward" - if node.op == "placeholder" and str(node.name).startswith("tangents"): - return "backward" - return "unknown" - - -def bitset_counts(nodes, edges): - idx = {node: i for i, node in enumerate(nodes)} - children = [[] for _ in nodes] - parents = [[] for _ in nodes] - for src, dst in edges: - children[idx[src]].append(idx[dst]) - parents[idx[dst]].append(idx[src]) - - descendants = [0] * len(nodes) - for i in range(len(nodes) - 1, -1, -1): - bits = 0 - for child in children[i]: - bits |= 1 << child - bits |= descendants[child] - descendants[i] = bits - - ancestors = [0] * len(nodes) - for i in range(len(nodes)): - bits = 0 - for parent in parents[i]: - bits |= 1 << parent - bits |= ancestors[parent] - ancestors[i] = bits - - return ( - [bits.bit_count() for bits in ancestors], - [bits.bit_count() for bits in descendants], - ) - - -def treewidth_upper_bounds(edges): - graph = nx.Graph() - graph.add_edges_from(edges) - width_min_fill, _ = nx.approximation.treewidth_min_fill_in(graph) - width_min_degree, _ = nx.approximation.treewidth_min_degree(graph) - - moral = graph.copy() - parents_by_child = defaultdict(list) - for src, dst in edges: - parents_by_child[dst].append(src) - for parents in parents_by_child.values(): - for i, left in enumerate(parents): - for right in parents[i + 1 :]: - moral.add_edge(left, right) - moral_width_min_fill, _ = nx.approximation.treewidth_min_fill_in(moral) - moral_width_min_degree, _ = nx.approximation.treewidth_min_degree(moral) - return { - "undirected_min_fill": width_min_fill, - "undirected_min_degree": width_min_degree, - 
"moralized_min_fill": moral_width_min_fill, - "moralized_min_degree": moral_width_min_degree, - "undirected_edges": graph.number_of_edges(), - "moralized_edges": moral.number_of_edges(), - } - - -def run_analysis(out_dir): - init_dist() - mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", (64,), mesh_dim_names=("dp",) - ) - set_nccl_topo_config(detect_nccl_topo_config(mesh)) - - config = llama3_configs["3B"](attn_backend="sdpa") - config.rope.max_seq_len = SEQ_LEN - with torch.device("meta"): - model = config.build() - - def input_fn(): - return torch.randint(0, config.vocab_size, (GLOBAL_BATCH, SEQ_LEN), device="cuda") - - mp_policy = MixedPrecisionPolicy( - param_dtype=torch.bfloat16, reduce_dtype=torch.float32 - ) - t0 = time.perf_counter() - with AutoParallel( - model, input_fn, mesh, mp_policy, repeated_subgraphs=True - ) as autop: - autop.add_parameter_memory_constraint(low=None, high=None) - autop.add_input_constraints([(Shard(0),)]) - autop.add_output_constraints([(Shard(0),)]) - opt = autop.sharding_optimizer - - ilp_nodes = [node for node in opt.nodes if node.op != "output"] - ilp_node_set = set(ilp_nodes) - edges = [] - dep_args = {} - dep_unique = {} - for node in ilp_nodes: - inputs = [inp for inp in opt._all_input_nodes(node) if inp in ilp_node_set] - dep_args[node] = len(inputs) - dep_unique[node] = len(set(inputs)) - for inp in set(inputs): - edges.append((inp, node)) - - offspring = Counter() - for src, _dst in edges: - offspring[src] += 1 - - ancestor_counts, descendant_counts = bitset_counts(ilp_nodes, edges) - node_to_idx = {node: i for i, node in enumerate(ilp_nodes)} - treewidth = treewidth_upper_bounds(edges) - - rows = [] - for node in ilp_nodes: - idx = node_to_idx[node] - rows.append( - { - "idx": idx, - "name": node.name, - "op": node.op, - "target": target_name(node), - "phase": phase(node), - "layer": layer_id(node), - "direct_dependency_args": dep_args[node], - "direct_dependency_nodes": dep_unique[node], - 
"direct_offspring_nodes": offspring[node], - "ancestor_count": ancestor_counts[idx], - "descendant_count": descendant_counts[idx], - "strategy_count": len(opt.strats[node].strategies), - } - ) - - merge_points = [ - row for row in rows if int(row["direct_dependency_nodes"]) > 1 - ] - merge_points.sort( - key=lambda row: ( - -int(row["direct_dependency_nodes"]), - -int(row["descendant_count"]), - int(row["idx"]), - ) - ) - fanout_points = sorted( - rows, - key=lambda row: (-int(row["direct_offspring_nodes"]), int(row["idx"])), - ) - - out_dir = Path(out_dir) - out_dir.mkdir(parents=True, exist_ok=True) - node_csv = out_dir / "real_llama3_3b_dag_node_stats.csv" - with node_csv.open("w", newline="") as f: - writer = csv.DictWriter(f, fieldnames=list(rows[0].keys())) - writer.writeheader() - writer.writerows(rows) - - merge_csv = out_dir / "real_llama3_3b_merge_points.csv" - with merge_csv.open("w", newline="") as f: - writer = csv.DictWriter(f, fieldnames=list(rows[0].keys())) - writer.writeheader() - writer.writerows(merge_points) - - summary = { - "model": "LLaMA3 3B", - "mesh": "1D 64", - "trace_and_optimizer_build_s": time.perf_counter() - t0, - "ilp_nodes": len(ilp_nodes), - "dag_edges": len(edges), - "merge_points": len(merge_points), - "branch_points": sum(1 for row in rows if int(row["direct_offspring_nodes"]) > 1), - "max_direct_dependency_nodes": max(int(row["direct_dependency_nodes"]) for row in rows), - "max_direct_offspring_nodes": max(int(row["direct_offspring_nodes"]) for row in rows), - "max_ancestor_count": max(int(row["ancestor_count"]) for row in rows), - "max_descendant_count": max(int(row["descendant_count"]) for row in rows), - "treewidth_upper_bounds": treewidth, - "direct_dependency_histogram": dict( - sorted(Counter(int(row["direct_dependency_nodes"]) for row in rows).items()) - ), - "direct_offspring_histogram": dict( - sorted(Counter(int(row["direct_offspring_nodes"]) for row in rows).items()) - ), - "top_merge_points": merge_points[:30], - 
"top_fanout_points": fanout_points[:30], - "node_stats_csv": str(node_csv), - "merge_points_csv": str(merge_csv), - } - summary_path = out_dir / "real_llama3_3b_dag_summary.json" - summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True)) - print(json.dumps(summary, indent=2, sort_keys=True)) - - -def main(): - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s %(levelname)s:%(name)s:%(message)s", - ) - run_analysis("profile_results") - - -if __name__ == "__main__": - main() diff --git a/profile_results/real_llama3_optimizer_presolve_3d4d.log b/profile_results/real_llama3_optimizer_presolve_3d4d.log deleted file mode 100644 index 923ec1f1..00000000 --- a/profile_results/real_llama3_optimizer_presolve_3d4d.log +++ /dev/null @@ -1,7 +0,0 @@ -[14:50:20] start model=1B mesh_ndim=3 skip_solve timeout=1200s -2026-05-26 14:50:29,648 INFO:autoparallel.api:Graph tracing took 6.073s -2026-05-26 14:58:18,227 INFO:autoparallel.optimize_sharding:ShardingOptimizer phase profile: phase=strategy_enumeration mesh_shape=(4, 4, 4) mesh_dim_names=('dp', 'tp', 'cp') mesh_size=64 model_params=1.24B graph_nodes=4140 strategy_options=662279 option_tuples=181062856 elapsed=459.509s -2026-05-26 15:07:42,067 INFO:autoparallel.optimize_sharding:ShardingOptimizer phase profile: phase=decision_vars mesh_shape=(4, 4, 4) mesh_dim_names=('dp', 'tp', 'cp') mesh_size=64 model_params=1.24B unique_ilp_vars=20390366 logical_decision_vars=181062856 cluster_copied_decision_vars=160672490 elapsed=462.310s -[15:10:23] done model=1B mesh_ndim=3 rc=124 -[15:10:23] start model=1B mesh_ndim=4 skip_solve timeout=1200s -2026-05-26 15:10:32,788 INFO:autoparallel.api:Graph tracing took 6.079s diff --git a/profile_results/real_llama3_optimizer_sweep.csv b/profile_results/real_llama3_optimizer_sweep.csv deleted file mode 100644 index 30d2e4f5..00000000 --- a/profile_results/real_llama3_optimizer_sweep.csv +++ /dev/null @@ -1,9 +0,0 @@ 
-cluster_copied_decision_vars,compute_cost_estimation_s,constraints_init,constraints_solve,cost_estimation_s,decision_var_build_s,decision_var_overhead_s,edge_cost_estimation_s,extract_s,graph_nodes,ilp_construction_s,logical_decision_vars,max_strategies_per_node,mesh_ndim,mesh_shape,mesh_size,model_key,objective,objective_s,optimizer_pipeline_s,option_tuples,parameter_b,parameter_gib,parameter_nodes,parameter_numel,solve_s,status,strategy_enumeration_s,strategy_options,tensor_nodes,total_wall_s,unique_ilp_vars,validation_s -101888,0.4790569522883743,7038,7042,0.5384756466373801,0.6978608381468803,0.09741721651516855,0.05941869434900582,0.01627982617355883,4140,0.26339267240837216,114928,10,1,64,64,1B,75411.02054353141,0.053351440001279116,3.032338660908863,114928,1.2358144,4.603767395019531,146,1235814400,0.49132931185886264,Optimal,0.6722944700159132,18503,4139,8.946083615999669,13040,0.31402136106044054 -194792,0.48607375379651785,8080,8084,0.5489266884978861,0.7471572819631547,0.1306607834994793,0.06285293470136821,0.029804171063005924,7200,0.3148333504796028,208698,10,1,64,64,3B,155857.5709074804,0.05978171294555068,4.169702837942168,208698,3.212749824,11.968425750732422,254,3212749824,0.5855939809698611,Optimal,0.5360530489124358,32969,7199,14.472710577072576,13906,0.03955800808034837 -224240,0.49045672081410885,8372,8376,0.5536671618465334,1.1619730349630117,0.5399796243291348,0.06321044103242457,0.03362119919620454,8220,0.7288401401601732,238203,10,1,64,64,8B,213343.3574716149,0.05892709596082568,4.762722868937999,238203,8.030261248,29.915054321289062,291,8030261248,0.5859912640880793,Optimal,0.9387421838473529,37635,8219,16.452271425863728,13963,0.045778295025229454 
-596400,0.5983547926880419,12044,12048,0.6777467841748148,2.653722374001518,1.875488000921905,0.0793919914867729,0.2056352950166911,20460,2.2220516917295754,612283,10,1,64,64,70B,965500.0409067452,0.0730493909213692,20.028923405101523,612283,70.553706496,262.8330383300781,723,70553706496,1.5257919810246676,Optimal,3.3026473850477487,95379,20459,50.90600106609054,15883,0.1628595821093768 -946046,0.4775047143921256,15494,15498,0.5445251299533993,2.283439102116972,1.6354041469749063,0.0670204155612737,0.17483325605280697,32190,2.005914915120229,963447,10,1,64,64,405B,3172012.7008089907,0.06962158717215061,29.85055986023508,963447,405.8533888,1511.9216918945312,1137,405853388800,2.56223003892228,Optimal,2.5583339028526098,150073,32189,77.86599416891113,17401,0.18959671608172357 -3854214,1.9979437342844903,173186,173190,4.75627763918601,11.933482899097726,4.112962566781789,2.7583339049015194,0.03040059795603156,4140,10.42051934893243,4337060,82,2,8x8,64,1B,57041.81060181375,2.17517895414494,109.2090197771322,4337060,1.2358144,4.603767395019531,146,1235814400,80.18635749211535,Optimal,8.398531069047749,107753,4139,115.10326781589538,482846,0.024392321007326245 -7135218,2.101260715862736,176564,176568,5.0140090675558895,14.759843383915722,6.347998866345733,2.9127483516931534,0.04800663981586695,7200,14.323183785192668,7623714,82,2,8x8,64,3B,122291.9385011857,2.4431078990455717,118.39831594773568,7623714,3.212749824,11.968425750732422,254,3212749824,78.84844117495231,Optimal,9.923545255092904,188315,7199,130.30269417585805,488496,0.053027451038360596 -8216282,1.9884659524541348,177172,177176,4.743945160182193,13.453818985959515,5.6245344209019095,2.755479207728058,0.04394924081861973,8220,11.563520586816594,8703393,82,2,8x8,64,8B,178228.3264244111,3.2896198199596256,123.55457829684019,8703393,8.030261248,29.915054321289062,291,8030261248,86.02262015617453,Optimal,9.262494687922299,214965,8219,135.2341975120362,487111,0.0497884638607502 diff --git 
a/profile_results/real_llama3_optimizer_sweep.jsonl b/profile_results/real_llama3_optimizer_sweep.jsonl deleted file mode 100644 index 67428955..00000000 --- a/profile_results/real_llama3_optimizer_sweep.jsonl +++ /dev/null @@ -1,8 +0,0 @@ -{"cluster_copied_decision_vars": 101888, "compute_cost_estimation_s": 0.4790569522883743, "constraints_init": 7038, "constraints_solve": 7042, "cost_estimation_s": 0.5384756466373801, "decision_var_build_s": 0.6978608381468803, "decision_var_overhead_s": 0.09741721651516855, "edge_cost_estimation_s": 0.05941869434900582, "extract_s": 0.01627982617355883, "graph_nodes": 4140, "ilp_construction_s": 0.26339267240837216, "logical_decision_vars": 114928, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "1B", "objective": 75411.02054353141, "objective_s": 0.053351440001279116, "optimizer_pipeline_s": 3.032338660908863, "option_tuples": 114928, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 0.49132931185886264, "status": "Optimal", "strategy_enumeration_s": 0.6722944700159132, "strategy_options": 18503, "tensor_nodes": 4139, "total_wall_s": 8.946083615999669, "unique_ilp_vars": 13040, "validation_s": 0.31402136106044054} -{"cluster_copied_decision_vars": 194792, "compute_cost_estimation_s": 0.48607375379651785, "constraints_init": 8080, "constraints_solve": 8084, "cost_estimation_s": 0.5489266884978861, "decision_var_build_s": 0.7471572819631547, "decision_var_overhead_s": 0.1306607834994793, "edge_cost_estimation_s": 0.06285293470136821, "extract_s": 0.029804171063005924, "graph_nodes": 7200, "ilp_construction_s": 0.3148333504796028, "logical_decision_vars": 208698, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "3B", "objective": 155857.5709074804, "objective_s": 0.05978171294555068, "optimizer_pipeline_s": 4.169702837942168, "option_tuples": 208698, 
"parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 0.5855939809698611, "status": "Optimal", "strategy_enumeration_s": 0.5360530489124358, "strategy_options": 32969, "tensor_nodes": 7199, "total_wall_s": 14.472710577072576, "unique_ilp_vars": 13906, "validation_s": 0.03955800808034837} -{"cluster_copied_decision_vars": 224240, "compute_cost_estimation_s": 0.49045672081410885, "constraints_init": 8372, "constraints_solve": 8376, "cost_estimation_s": 0.5536671618465334, "decision_var_build_s": 1.1619730349630117, "decision_var_overhead_s": 0.5399796243291348, "edge_cost_estimation_s": 0.06321044103242457, "extract_s": 0.03362119919620454, "graph_nodes": 8220, "ilp_construction_s": 0.7288401401601732, "logical_decision_vars": 238203, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "8B", "objective": 213343.3574716149, "objective_s": 0.05892709596082568, "optimizer_pipeline_s": 4.762722868937999, "option_tuples": 238203, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 0.5859912640880793, "status": "Optimal", "strategy_enumeration_s": 0.9387421838473529, "strategy_options": 37635, "tensor_nodes": 8219, "total_wall_s": 16.452271425863728, "unique_ilp_vars": 13963, "validation_s": 0.045778295025229454} -{"cluster_copied_decision_vars": 596400, "compute_cost_estimation_s": 0.5983547926880419, "constraints_init": 12044, "constraints_solve": 12048, "cost_estimation_s": 0.6777467841748148, "decision_var_build_s": 2.653722374001518, "decision_var_overhead_s": 1.875488000921905, "edge_cost_estimation_s": 0.0793919914867729, "extract_s": 0.2056352950166911, "graph_nodes": 20460, "ilp_construction_s": 2.2220516917295754, "logical_decision_vars": 612283, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "70B", "objective": 
965500.0409067452, "objective_s": 0.0730493909213692, "optimizer_pipeline_s": 20.028923405101523, "option_tuples": 612283, "parameter_b": 70.553706496, "parameter_gib": 262.8330383300781, "parameter_nodes": 723, "parameter_numel": 70553706496, "solve_s": 1.5257919810246676, "status": "Optimal", "strategy_enumeration_s": 3.3026473850477487, "strategy_options": 95379, "tensor_nodes": 20459, "total_wall_s": 50.90600106609054, "unique_ilp_vars": 15883, "validation_s": 0.1628595821093768} -{"cluster_copied_decision_vars": 946046, "compute_cost_estimation_s": 0.4775047143921256, "constraints_init": 15494, "constraints_solve": 15498, "cost_estimation_s": 0.5445251299533993, "decision_var_build_s": 2.283439102116972, "decision_var_overhead_s": 1.6354041469749063, "edge_cost_estimation_s": 0.0670204155612737, "extract_s": 0.17483325605280697, "graph_nodes": 32190, "ilp_construction_s": 2.005914915120229, "logical_decision_vars": 963447, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "405B", "objective": 3172012.7008089907, "objective_s": 0.06962158717215061, "optimizer_pipeline_s": 29.85055986023508, "option_tuples": 963447, "parameter_b": 405.8533888, "parameter_gib": 1511.9216918945312, "parameter_nodes": 1137, "parameter_numel": 405853388800, "solve_s": 2.56223003892228, "status": "Optimal", "strategy_enumeration_s": 2.5583339028526098, "strategy_options": 150073, "tensor_nodes": 32189, "total_wall_s": 77.86599416891113, "unique_ilp_vars": 17401, "validation_s": 0.18959671608172357} -{"cluster_copied_decision_vars": 3854214, "compute_cost_estimation_s": 1.9979437342844903, "constraints_init": 173186, "constraints_solve": 173190, "cost_estimation_s": 4.75627763918601, "decision_var_build_s": 11.933482899097726, "decision_var_overhead_s": 4.112962566781789, "edge_cost_estimation_s": 2.7583339049015194, "extract_s": 0.03040059795603156, "graph_nodes": 4140, "ilp_construction_s": 10.42051934893243, "logical_decision_vars": 
4337060, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "1B", "objective": 57041.81060181375, "objective_s": 2.17517895414494, "optimizer_pipeline_s": 109.2090197771322, "option_tuples": 4337060, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 80.18635749211535, "status": "Optimal", "strategy_enumeration_s": 8.398531069047749, "strategy_options": 107753, "tensor_nodes": 4139, "total_wall_s": 115.10326781589538, "unique_ilp_vars": 482846, "validation_s": 0.024392321007326245} -{"cluster_copied_decision_vars": 7135218, "compute_cost_estimation_s": 2.101260715862736, "constraints_init": 176564, "constraints_solve": 176568, "cost_estimation_s": 5.0140090675558895, "decision_var_build_s": 14.759843383915722, "decision_var_overhead_s": 6.347998866345733, "edge_cost_estimation_s": 2.9127483516931534, "extract_s": 0.04800663981586695, "graph_nodes": 7200, "ilp_construction_s": 14.323183785192668, "logical_decision_vars": 7623714, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "3B", "objective": 122291.9385011857, "objective_s": 2.4431078990455717, "optimizer_pipeline_s": 118.39831594773568, "option_tuples": 7623714, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 78.84844117495231, "status": "Optimal", "strategy_enumeration_s": 9.923545255092904, "strategy_options": 188315, "tensor_nodes": 7199, "total_wall_s": 130.30269417585805, "unique_ilp_vars": 488496, "validation_s": 0.053027451038360596} -{"cluster_copied_decision_vars": 8216282, "compute_cost_estimation_s": 1.9884659524541348, "constraints_init": 177172, "constraints_solve": 177176, "cost_estimation_s": 4.743945160182193, "decision_var_build_s": 13.453818985959515, "decision_var_overhead_s": 5.6245344209019095, "edge_cost_estimation_s": 2.755479207728058, 
"extract_s": 0.04394924081861973, "graph_nodes": 8220, "ilp_construction_s": 11.563520586816594, "logical_decision_vars": 8703393, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "8B", "objective": 178228.3264244111, "objective_s": 3.2896198199596256, "optimizer_pipeline_s": 123.55457829684019, "option_tuples": 8703393, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 86.02262015617453, "status": "Optimal", "strategy_enumeration_s": 9.262494687922299, "strategy_options": 214965, "tensor_nodes": 8219, "total_wall_s": 135.2341975120362, "unique_ilp_vars": 487111, "validation_s": 0.0497884638607502} diff --git a/profile_results/real_llama3_optimizer_sweep.log b/profile_results/real_llama3_optimizer_sweep.log deleted file mode 100644 index 21b02b4e..00000000 --- a/profile_results/real_llama3_optimizer_sweep.log +++ /dev/null @@ -1,54 +0,0 @@ -[14:16:02] start model=1B mesh_ndim=1 timeout=900s -2026-05-26 14:16:10,889 INFO:autoparallel.api:Graph tracing took 5.582s -2026-05-26 14:16:13,492 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=1.24B param_nodes=146 graph_nodes=4140 tensor_nodes=4139 strategy_options=18503 option_tuples=114928 unique_ilp_vars=13040 logical_decision_vars=114928 constraints=7038 timings={strategy_enumeration=0.672s,cost_estimation=0.538s,ilp_construction=0.263s,validation=0.314s,total=2.469s} -2026-05-26 14:16:14,059 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=1.24B unique_ilp_vars=13040 constraints=7042 status=Optimal objective=75411.0205 timings={strategy_enumeration=0.672s,cost_estimation=0.538s,ilp_construction=0.263s,objective=0.053s,solve=0.491s,extract=0.016s,total_solve_call=0.563s,total_pipeline=3.032s} 
-{"cluster_copied_decision_vars": 101888, "compute_cost_estimation_s": 0.4790569522883743, "constraints_init": 7038, "constraints_solve": 7042, "cost_estimation_s": 0.5384756466373801, "decision_var_build_s": 0.6978608381468803, "decision_var_overhead_s": 0.09741721651516855, "edge_cost_estimation_s": 0.05941869434900582, "extract_s": 0.01627982617355883, "graph_nodes": 4140, "ilp_construction_s": 0.26339267240837216, "logical_decision_vars": 114928, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "1B", "objective": 75411.02054353141, "objective_s": 0.053351440001279116, "optimizer_pipeline_s": 3.032338660908863, "option_tuples": 114928, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 0.49132931185886264, "status": "Optimal", "strategy_enumeration_s": 0.6722944700159132, "strategy_options": 18503, "tensor_nodes": 4139, "total_wall_s": 8.946083615999669, "unique_ilp_vars": 13040, "validation_s": 0.31402136106044054} -[14:16:15] done model=1B mesh_ndim=1 rc=0 -[14:16:15] start model=3B mesh_ndim=1 timeout=900s -2026-05-26 14:16:27,671 INFO:autoparallel.api:Graph tracing took 9.505s -2026-05-26 14:16:31,732 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=3.21B param_nodes=254 graph_nodes=7200 tensor_nodes=7199 strategy_options=32969 option_tuples=208698 unique_ilp_vars=13906 logical_decision_vars=208698 constraints=8080 timings={strategy_enumeration=0.536s,cost_estimation=0.549s,ilp_construction=0.315s,validation=0.040s,total=3.492s} -2026-05-26 14:16:32,416 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=3.21B unique_ilp_vars=13906 constraints=8084 status=Optimal objective=155857.5709 
timings={strategy_enumeration=0.536s,cost_estimation=0.549s,ilp_construction=0.315s,objective=0.060s,solve=0.586s,extract=0.030s,total_solve_call=0.678s,total_pipeline=4.170s} -{"cluster_copied_decision_vars": 194792, "compute_cost_estimation_s": 0.48607375379651785, "constraints_init": 8080, "constraints_solve": 8084, "cost_estimation_s": 0.5489266884978861, "decision_var_build_s": 0.7471572819631547, "decision_var_overhead_s": 0.1306607834994793, "edge_cost_estimation_s": 0.06285293470136821, "extract_s": 0.029804171063005924, "graph_nodes": 7200, "ilp_construction_s": 0.3148333504796028, "logical_decision_vars": 208698, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "3B", "objective": 155857.5709074804, "objective_s": 0.05978171294555068, "optimizer_pipeline_s": 4.169702837942168, "option_tuples": 208698, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 0.5855939809698611, "status": "Optimal", "strategy_enumeration_s": 0.5360530489124358, "strategy_options": 32969, "tensor_nodes": 7199, "total_wall_s": 14.472710577072576, "unique_ilp_vars": 13906, "validation_s": 0.03955800808034837} -[14:16:33] done model=3B mesh_ndim=1 rc=0 -[14:16:33] start model=8B mesh_ndim=1 timeout=900s -2026-05-26 14:16:47,847 INFO:autoparallel.api:Graph tracing took 11.170s -2026-05-26 14:16:52,205 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=8.03B param_nodes=291 graph_nodes=8220 tensor_nodes=8219 strategy_options=37635 option_tuples=238203 unique_ilp_vars=13963 logical_decision_vars=238203 constraints=8372 timings={strategy_enumeration=0.939s,cost_estimation=0.554s,ilp_construction=0.729s,validation=0.046s,total=4.081s} -2026-05-26 14:16:52,893 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 
model_params=8.03B unique_ilp_vars=13963 constraints=8376 status=Optimal objective=213343.3575 timings={strategy_enumeration=0.939s,cost_estimation=0.554s,ilp_construction=0.729s,objective=0.059s,solve=0.586s,extract=0.034s,total_solve_call=0.681s,total_pipeline=4.763s} -{"cluster_copied_decision_vars": 224240, "compute_cost_estimation_s": 0.49045672081410885, "constraints_init": 8372, "constraints_solve": 8376, "cost_estimation_s": 0.5536671618465334, "decision_var_build_s": 1.1619730349630117, "decision_var_overhead_s": 0.5399796243291348, "edge_cost_estimation_s": 0.06321044103242457, "extract_s": 0.03362119919620454, "graph_nodes": 8220, "ilp_construction_s": 0.7288401401601732, "logical_decision_vars": 238203, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "8B", "objective": 213343.3574716149, "objective_s": 0.05892709596082568, "optimizer_pipeline_s": 4.762722868937999, "option_tuples": 238203, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 0.5859912640880793, "status": "Optimal", "strategy_enumeration_s": 0.9387421838473529, "strategy_options": 37635, "tensor_nodes": 8219, "total_wall_s": 16.452271425863728, "unique_ilp_vars": 13963, "validation_s": 0.045778295025229454} -[14:16:54] done model=8B mesh_ndim=1 rc=0 -[14:16:54] start model=70B mesh_ndim=1 timeout=900s -2026-05-26 14:17:27,109 INFO:autoparallel.api:Graph tracing took 29.053s -2026-05-26 14:17:46,179 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=70.55B param_nodes=723 graph_nodes=20460 tensor_nodes=20459 strategy_options=95379 option_tuples=612283 unique_ilp_vars=15883 logical_decision_vars=612283 constraints=12044 timings={strategy_enumeration=3.303s,cost_estimation=0.678s,ilp_construction=2.222s,validation=0.163s,total=18.219s} -2026-05-26 14:17:48,011 
INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=70.55B unique_ilp_vars=15883 constraints=12048 status=Optimal objective=965500.0409 timings={strategy_enumeration=3.303s,cost_estimation=0.678s,ilp_construction=2.222s,objective=0.073s,solve=1.526s,extract=0.206s,total_solve_call=1.810s,total_pipeline=20.029s} -{"cluster_copied_decision_vars": 596400, "compute_cost_estimation_s": 0.5983547926880419, "constraints_init": 12044, "constraints_solve": 12048, "cost_estimation_s": 0.6777467841748148, "decision_var_build_s": 2.653722374001518, "decision_var_overhead_s": 1.875488000921905, "edge_cost_estimation_s": 0.0793919914867729, "extract_s": 0.2056352950166911, "graph_nodes": 20460, "ilp_construction_s": 2.2220516917295754, "logical_decision_vars": 612283, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "70B", "objective": 965500.0409067452, "objective_s": 0.0730493909213692, "optimizer_pipeline_s": 20.028923405101523, "option_tuples": 612283, "parameter_b": 70.553706496, "parameter_gib": 262.8330383300781, "parameter_nodes": 723, "parameter_numel": 70553706496, "solve_s": 1.5257919810246676, "status": "Optimal", "strategy_enumeration_s": 3.3026473850477487, "strategy_options": 95379, "tensor_nodes": 20459, "total_wall_s": 50.90600106609054, "unique_ilp_vars": 15883, "validation_s": 0.1628595821093768} -[14:17:51] done model=70B mesh_ndim=1 rc=0 -[14:17:51] start model=405B mesh_ndim=1 timeout=900s -2026-05-26 14:18:40,587 INFO:autoparallel.api:Graph tracing took 45.218s -2026-05-26 14:19:09,868 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=405.85B param_nodes=1137 graph_nodes=32190 tensor_nodes=32189 strategy_options=150073 option_tuples=963447 unique_ilp_vars=17401 logical_decision_vars=963447 constraints=15494 
timings={strategy_enumeration=2.558s,cost_estimation=0.545s,ilp_construction=2.006s,validation=0.190s,total=27.039s} -2026-05-26 14:19:12,705 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=405.85B unique_ilp_vars=17401 constraints=15498 status=Optimal objective=3172012.7008 timings={strategy_enumeration=2.558s,cost_estimation=0.545s,ilp_construction=2.006s,objective=0.070s,solve=2.562s,extract=0.175s,total_solve_call=2.811s,total_pipeline=29.851s} -{"cluster_copied_decision_vars": 946046, "compute_cost_estimation_s": 0.4775047143921256, "constraints_init": 15494, "constraints_solve": 15498, "cost_estimation_s": 0.5445251299533993, "decision_var_build_s": 2.283439102116972, "decision_var_overhead_s": 1.6354041469749063, "edge_cost_estimation_s": 0.0670204155612737, "extract_s": 0.17483325605280697, "graph_nodes": 32190, "ilp_construction_s": 2.005914915120229, "logical_decision_vars": 963447, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "405B", "objective": 3172012.7008089907, "objective_s": 0.06962158717215061, "optimizer_pipeline_s": 29.85055986023508, "option_tuples": 963447, "parameter_b": 405.8533888, "parameter_gib": 1511.9216918945312, "parameter_nodes": 1137, "parameter_numel": 405853388800, "solve_s": 2.56223003892228, "status": "Optimal", "strategy_enumeration_s": 2.5583339028526098, "strategy_options": 150073, "tensor_nodes": 32189, "total_wall_s": 77.86599416891113, "unique_ilp_vars": 17401, "validation_s": 0.18959671608172357} -[14:19:15] done model=405B mesh_ndim=1 rc=0 -[14:19:15] start model=1B mesh_ndim=2 timeout=900s -2026-05-26 14:19:24,184 INFO:autoparallel.api:Graph tracing took 5.551s -2026-05-26 14:19:51,030 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=1.24B param_nodes=146 graph_nodes=4140 tensor_nodes=4139 
strategy_options=107753 option_tuples=4337060 unique_ilp_vars=482846 logical_decision_vars=4337060 constraints=173186 timings={strategy_enumeration=8.399s,cost_estimation=4.756s,ilp_construction=10.421s,validation=0.024s,total=26.710s} -2026-05-26 14:21:13,538 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=1.24B unique_ilp_vars=482846 constraints=173190 status=Optimal objective=57041.8106 timings={strategy_enumeration=8.399s,cost_estimation=4.756s,ilp_construction=10.421s,objective=2.175s,solve=80.186s,extract=0.030s,total_solve_call=82.499s,total_pipeline=109.209s} -{"cluster_copied_decision_vars": 3854214, "compute_cost_estimation_s": 1.9979437342844903, "constraints_init": 173186, "constraints_solve": 173190, "cost_estimation_s": 4.75627763918601, "decision_var_build_s": 11.933482899097726, "decision_var_overhead_s": 4.112962566781789, "edge_cost_estimation_s": 2.7583339049015194, "extract_s": 0.03040059795603156, "graph_nodes": 4140, "ilp_construction_s": 10.42051934893243, "logical_decision_vars": 4337060, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "1B", "objective": 57041.81060181375, "objective_s": 2.17517895414494, "optimizer_pipeline_s": 109.2090197771322, "option_tuples": 4337060, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 80.18635749211535, "status": "Optimal", "strategy_enumeration_s": 8.398531069047749, "strategy_options": 107753, "tensor_nodes": 4139, "total_wall_s": 115.10326781589538, "unique_ilp_vars": 482846, "validation_s": 0.024392321007326245} -[14:21:16] done model=1B mesh_ndim=2 rc=0 -[14:21:16] start model=3B mesh_ndim=2 timeout=900s -2026-05-26 14:21:30,429 INFO:autoparallel.api:Graph tracing took 10.867s -2026-05-26 14:22:08,135 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 
8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=3.21B param_nodes=254 graph_nodes=7200 tensor_nodes=7199 strategy_options=188315 option_tuples=7623714 unique_ilp_vars=488496 logical_decision_vars=7623714 constraints=176564 timings={strategy_enumeration=9.924s,cost_estimation=5.014s,ilp_construction=14.323s,validation=0.053s,total=36.956s} -2026-05-26 14:23:29,596 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=3.21B unique_ilp_vars=488496 constraints=176568 status=Optimal objective=122291.9385 timings={strategy_enumeration=9.924s,cost_estimation=5.014s,ilp_construction=14.323s,objective=2.443s,solve=78.848s,extract=0.048s,total_solve_call=81.443s,total_pipeline=118.398s} -{"cluster_copied_decision_vars": 7135218, "compute_cost_estimation_s": 2.101260715862736, "constraints_init": 176564, "constraints_solve": 176568, "cost_estimation_s": 5.0140090675558895, "decision_var_build_s": 14.759843383915722, "decision_var_overhead_s": 6.347998866345733, "edge_cost_estimation_s": 2.9127483516931534, "extract_s": 0.04800663981586695, "graph_nodes": 7200, "ilp_construction_s": 14.323183785192668, "logical_decision_vars": 7623714, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "3B", "objective": 122291.9385011857, "objective_s": 2.4431078990455717, "optimizer_pipeline_s": 118.39831594773568, "option_tuples": 7623714, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 78.84844117495231, "status": "Optimal", "strategy_enumeration_s": 9.923545255092904, "strategy_options": 188315, "tensor_nodes": 7199, "total_wall_s": 130.30269417585805, "unique_ilp_vars": 488496, "validation_s": 0.053027451038360596} -[14:23:32] done model=3B mesh_ndim=2 rc=0 -[14:23:32] start model=8B mesh_ndim=2 timeout=900s -2026-05-26 14:23:46,265 INFO:autoparallel.api:Graph 
tracing took 11.149s -2026-05-26 14:24:20,655 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=8.03B param_nodes=291 graph_nodes=8220 tensor_nodes=8219 strategy_options=214965 option_tuples=8703393 unique_ilp_vars=487111 logical_decision_vars=8703393 constraints=177172 timings={strategy_enumeration=9.262s,cost_estimation=4.744s,ilp_construction=11.564s,validation=0.050s,total=34.114s} -2026-05-26 14:25:50,114 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=8.03B unique_ilp_vars=487111 constraints=177176 status=Optimal objective=178228.3264 timings={strategy_enumeration=9.262s,cost_estimation=4.744s,ilp_construction=11.564s,objective=3.290s,solve=86.023s,extract=0.044s,total_solve_call=89.441s,total_pipeline=123.555s} -{"cluster_copied_decision_vars": 8216282, "compute_cost_estimation_s": 1.9884659524541348, "constraints_init": 177172, "constraints_solve": 177176, "cost_estimation_s": 4.743945160182193, "decision_var_build_s": 13.453818985959515, "decision_var_overhead_s": 5.6245344209019095, "edge_cost_estimation_s": 2.755479207728058, "extract_s": 0.04394924081861973, "graph_nodes": 8220, "ilp_construction_s": 11.563520586816594, "logical_decision_vars": 8703393, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "8B", "objective": 178228.3264244111, "objective_s": 3.2896198199596256, "optimizer_pipeline_s": 123.55457829684019, "option_tuples": 8703393, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 86.02262015617453, "status": "Optimal", "strategy_enumeration_s": 9.262494687922299, "strategy_options": 214965, "tensor_nodes": 8219, "total_wall_s": 135.2341975120362, "unique_ilp_vars": 487111, "validation_s": 0.0497884638607502} -[14:25:52] done model=8B 
mesh_ndim=2 rc=0 -[14:25:52] start model=1B mesh_ndim=3 timeout=300s -2026-05-26 14:26:01,331 INFO:autoparallel.api:Graph tracing took 5.531s -[14:30:53] done model=1B mesh_ndim=3 rc=124 -[14:30:53] start model=1B mesh_ndim=4 timeout=300s -2026-05-26 14:31:01,610 INFO:autoparallel.api:Graph tracing took 5.635s -[14:35:53] done model=1B mesh_ndim=4 rc=124 diff --git a/profile_results/real_llama3_optimizer_sweep.py b/profile_results/real_llama3_optimizer_sweep.py deleted file mode 100644 index 7e32b14c..00000000 --- a/profile_results/real_llama3_optimizer_sweep.py +++ /dev/null @@ -1,351 +0,0 @@ -import argparse -import csv -import json -import logging -import math -import sys -import time -from pathlib import Path - -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -sys.path.insert(0, "/home/wangkj/workspace/torchtitan") - -from torchtitan.models.llama3 import llama3_configs # noqa: E402 - -from autoparallel.api import AutoParallel -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - - -WORLD_SIZE = 64 -SEQ_LEN = 256 -GLOBAL_BATCH = 64 -MESHES = { - 1: ((64,), ("dp",)), - 2: ((8, 8), ("dp", "tp")), - 3: ((4, 4, 4), ("dp", "tp", "cp")), - 4: ((4, 4, 2, 2), ("dp", "tp", "cp", "ep")), -} - - -def init_dist(): - if not torch.distributed.is_initialized(): - torch.distributed.init_process_group( - "fake", store=FakeStore(), rank=0, world_size=WORLD_SIZE - ) - - -def flatten_profile(model_key, mesh_ndim, profile, total_wall_s, solve_ran): - model = profile["model"] - timings = profile["timings"] - strategies = profile["strategies"] - ilp = profile["ilp"] - solve = profile.get("last_solve", {}) - return { - "model_key": model_key, - "mesh_ndim": mesh_ndim, - "mesh_shape": "x".join(map(str, 
profile["mesh"]["shape"])), - "mesh_size": profile["mesh"]["size"], - "parameter_numel": model["parameter_numel"], - "parameter_b": model["parameter_numel"] / 1_000_000_000, - "parameter_gib": model["parameter_bytes"] / (1024**3), - "graph_nodes": model["graph_nodes"], - "tensor_nodes": model["tensor_nodes"], - "parameter_nodes": model["parameter_nodes"], - "strategy_options": strategies["strategy_options"], - "option_tuples": strategies["option_tuples"], - "max_strategies_per_node": strategies["max_strategies_per_node"], - "unique_ilp_vars": ilp["unique_variables"], - "logical_decision_vars": ilp["logical_decision_variables"], - "cluster_copied_decision_vars": ilp["cluster_copied_decision_variables"], - "constraints_init": ilp["constraints"], - "constraints_presolve": profile.get("constraints_presolve", ilp["constraints"]), - "constraints_solve": solve.get("constraints", ""), - "strategy_enumeration_s": timings["strategy_enumeration_s"], - "compute_cost_estimation_s": timings["compute_cost_estimation_s"], - "edge_cost_estimation_s": timings["edge_cost_estimation_s"], - "cost_estimation_s": timings["cost_estimation_s"], - "decision_var_build_s": timings["decision_var_build_s"], - "decision_var_overhead_s": timings["decision_var_overhead_s"], - "ilp_construction_s": timings["ilp_construction_s"], - "validation_s": timings["validation_s"], - "objective_s": solve.get("objective_s", ""), - "solve_s": solve.get("solve_s", ""), - "extract_s": solve.get("extract_s", ""), - "optimizer_pipeline_s": solve.get( - "pipeline_total_s", - timings["init_total_s"], - ), - "total_wall_s": total_wall_s, - "objective": solve.get("objective", ""), - "status": solve.get("status", "NotSolved"), - "solve_ran": solve_ran, - } - - -def run_one(model_key, mesh_ndim, skip_solve=False): - init_dist() - mesh_shape, mesh_dim_names = MESHES[mesh_ndim] - mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", mesh_shape, mesh_dim_names=mesh_dim_names - ) - 
set_nccl_topo_config(detect_nccl_topo_config(mesh)) - - config = llama3_configs[model_key](attn_backend="sdpa") - config.rope.max_seq_len = SEQ_LEN - with torch.device("meta"): - model = config.build() - - def input_fn(): - return torch.randint( - 0, - config.vocab_size, - (GLOBAL_BATCH, SEQ_LEN), - device="cuda", - ) - - mp_policy = MixedPrecisionPolicy( - param_dtype=torch.bfloat16, reduce_dtype=torch.float32 - ) - t0 = time.perf_counter() - with AutoParallel( - model, - input_fn, - mesh, - mp_policy, - repeated_subgraphs=True, - ) as autop: - autop.add_parameter_memory_constraint(low=None, high=None) - input_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1) - if mesh.ndim == 1: - output_sharding = (Shard(0),) - else: - output_sharding = (Shard(0), Shard(2)) + (Replicate(),) * ( - mesh.ndim - 2 - ) - autop.add_input_constraints([input_sharding]) - autop.add_output_constraints([output_sharding]) - autop.sharding_optimizer.profile["constraints_presolve"] = len( - autop.sharding_optimizer.prob.constraints - ) - if not skip_solve: - autop.optimize_placement(verbose=False) - profile = autop.sharding_optimizer.profile - return flatten_profile( - model_key, - mesh_ndim, - profile, - time.perf_counter() - t0, - solve_ran=not skip_solve, - ) - - -def append_jsonl(path, row): - path = Path(path) - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("a") as f: - f.write(json.dumps(row, sort_keys=True) + "\n") - - -def load_rows(path): - rows = [] - with Path(path).open() as f: - for line in f: - line = line.strip() - if line: - row = json.loads(line) - row.setdefault( - "constraints_presolve", - row.get("constraints_solve") or row.get("constraints_init"), - ) - row.setdefault("solve_ran", row.get("solve_s", "") != "") - rows.append(row) - rows.sort(key=lambda r: (r["mesh_ndim"], r["parameter_numel"])) - return rows - - -def write_csv(rows, path): - fields = [] - for row in rows: - for key in row: - if key not in fields: - fields.append(key) - with 
Path(path).open("w", newline="") as f: - writer = csv.DictWriter(f, fieldnames=fields) - writer.writeheader() - writer.writerows(rows) - - -def nice(v): - if v >= 1_000_000_000: - return f"{v / 1_000_000_000:.1f}B" - if v >= 1_000_000: - return f"{v / 1_000_000:.1f}M" - if v >= 1_000: - return f"{v / 1_000:.1f}K" - if v >= 10: - return f"{v:.0f}" - return f"{v:.2g}" - - -def write_svg(rows, path, x_key, series_key, title): - metrics = [ - ("strategy_enumeration_s", "strategy enum (s)"), - ("cost_estimation_s", "cost estimation (s)"), - ("ilp_construction_s", "ILP construction (s)"), - ("objective_s", "objective build (s)"), - ("solve_s", "solve (s)"), - ("optimizer_pipeline_s", "pipeline total (s)"), - ("unique_ilp_vars", "unique ILP vars"), - ("constraints_presolve", "constraints"), - ] - width = 1600 - height = 1000 - panel_w = 360 - panel_h = 180 - margin_l = 62 - margin_t = 120 - gap_x = 30 - gap_y = 50 - colors = ["#2563eb", "#dc2626", "#16a34a", "#9333ea", "#ea580c"] - - def sx(x, xs, px): - lo, hi = min(xs), max(xs) - if lo == hi: - return px + panel_w / 2 - return px + (x - lo) / (hi - lo) * panel_w - - def sy(y, ys, py): - positives = [v for v in ys if v > 0] - lo = min(positives) - hi = max(positives) - if lo == hi: - return py + panel_h / 2 - return py + panel_h - (math.log10(max(y, lo)) - math.log10(lo)) / ( - math.log10(hi) - math.log10(lo) - ) * panel_h - - series_values = sorted({r[series_key] for r in rows}) - x_values = sorted({float(r[x_key]) for r in rows}) - svg = [ - f'', - '', - f'{title}', - 'Y axes are log scale. 
Missing series points timed out or were not run.', - ] - for i, value in enumerate(series_values): - x = 32 + (i % 8) * 180 - y = 84 + (i // 8) * 20 - svg.append( - f'' - ) - svg.append( - f'{series_key}={value}' - ) - - for idx, (metric, label) in enumerate(metrics): - col = idx % 4 - row = idx // 4 - px = margin_l + col * (panel_w + gap_x) - py = margin_t + row * (panel_h + gap_y) - ys = [ - float(r[metric]) - for r in rows - if r.get(metric) not in {"", None} and float(r[metric]) > 0 - ] - if not ys: - continue - svg.extend( - [ - f'{label}', - f'', - f'', - f'', - f'{nice(max(ys))}', - f'{nice(min(ys))}', - ] - ) - for xv in x_values: - svg.append( - f'{nice(xv)}' - ) - for sidx, series in enumerate(series_values): - pts = sorted( - [r for r in rows if r[series_key] == series], - key=lambda r: float(r[x_key]), - ) - color = colors[sidx % len(colors)] - coords = [ - ( - sx(float(r[x_key]), x_values, px), - sy(float(r[metric]), ys, py), - ) - for r in pts - if r.get(metric) not in {"", None} and float(r[metric]) > 0 - ] - if len(coords) >= 2: - svg.append( - '' - ) - for x, y in coords: - svg.append(f'') - svg.append("") - Path(path).write_text("\n".join(svg)) - - -def plot(jsonl, out_dir): - out_dir = Path(out_dir) - out_dir.mkdir(parents=True, exist_ok=True) - rows = load_rows(jsonl) - write_csv(rows, out_dir / "real_llama3_optimizer_sweep.csv") - write_svg( - rows, - out_dir / "real_llama3_by_model_size.svg", - "parameter_b", - "mesh_ndim", - "Real Llama3 optimizer profile vs model size", - ) - write_svg( - rows, - out_dir / "real_llama3_by_mesh_dim.svg", - "mesh_ndim", - "model_key", - "Real Llama3 optimizer profile vs mesh dimension", - ) - - -def main(): - parser = argparse.ArgumentParser() - sub = parser.add_subparsers(dest="cmd", required=True) - run = sub.add_parser("run-one") - run.add_argument("--model-key", choices=llama3_configs.keys(), required=True) - run.add_argument("--mesh-ndim", type=int, choices=MESHES.keys(), required=True) - 
run.add_argument("--out-jsonl", required=True) - run.add_argument("--skip-solve", action="store_true") - plot_cmd = sub.add_parser("plot") - plot_cmd.add_argument("--jsonl", required=True) - plot_cmd.add_argument("--out-dir", required=True) - args = parser.parse_args() - - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s %(levelname)s:%(name)s:%(message)s", - ) - logging.getLogger("autoparallel.optimize_sharding").setLevel(logging.INFO) - - if args.cmd == "run-one": - row = run_one(args.model_key, args.mesh_ndim, skip_solve=args.skip_solve) - append_jsonl(args.out_jsonl, row) - print(json.dumps(row, sort_keys=True)) - else: - plot(args.jsonl, args.out_dir) - - -if __name__ == "__main__": - main() diff --git a/profile_results/real_llama3_partial_presolve.csv b/profile_results/real_llama3_partial_presolve.csv deleted file mode 100644 index ab7b7fa9..00000000 --- a/profile_results/real_llama3_partial_presolve.csv +++ /dev/null @@ -1,3 +0,0 @@ -model_key,mesh_ndim,mesh_shape,parameter_b,graph_nodes,strategy_options,option_tuples,strategy_enumeration_s,unique_ilp_vars,logical_decision_vars,cluster_copied_decision_vars,decision_var_build_s,constraints,solve_s,status -1B,3,4x4x4,1.2358144,4140,662279,181062856,459.509,20390366,181062856,160672490,462.310,,,timeout_before_constraints -1B,4,4x4x2x2,1.2358144,,,,,,,,,,,not_run diff --git a/profile_results/real_llama3_timeouts.csv b/profile_results/real_llama3_timeouts.csv deleted file mode 100644 index c3e6c843..00000000 --- a/profile_results/real_llama3_timeouts.csv +++ /dev/null @@ -1,3 +0,0 @@ -model_key,mesh_ndim,mesh_shape,timeout_s,result -1B,3,4x4x4,1200,timeout_after_decision_vars_before_constraints -1B,4,4x4x2x2,,not_run diff --git a/qwen3_8b_autoparallel_30steps.log b/qwen3_8b_autoparallel_30steps.log deleted file mode 120000 index 5cc45d55..00000000 --- a/qwen3_8b_autoparallel_30steps.log +++ /dev/null @@ -1 +0,0 @@ -/tmp/qwen3_8b_autoparallel_30steps.log \ No newline at end of file diff --git 
a/tests/test_dp_solver.py b/tests/test_dp_solver.py new file mode 100644 index 00000000..3dbb2d10 --- /dev/null +++ b/tests/test_dp_solver.py @@ -0,0 +1,158 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + +import math +import operator + +import pytest +import torch +import torch.nn.functional as F + +from autoparallel.graph_passes.graph_utils import all_input_nodes +from autoparallel.optimize_sharding import DPBasedShardingSolver + + +class _FakeOptimizer: + def __init__(self, graph): + self.graph = graph + self.strats = {node: object() for node in graph.nodes} + self.nodes = list(self.strats.keys()) + + def _all_input_nodes(self, node): + return [ + input_node + for input_node in all_input_nodes(node) + if input_node in self.strats + ] + + +def _assert_predecessors_match_graph_indegrees(topology): + topology_nodes = set(topology.nodes) + assert set(topology.predecessors) == topology_nodes + assert set(topology.node_to_index) == topology_nodes + + for node in topology.nodes: + expected_predecessors = [ + input_node + for input_node in all_input_nodes(node) + if input_node in topology_nodes + ] + predecessors = topology.predecessors[node] + assert len(predecessors) == len(expected_predecessors) + assert predecessors == expected_predecessors + + +def test_dp_solver_builds_topological_order_for_merge_graph(): + class MergeModule(torch.nn.Module): + def forward(self, x, y): + a = x + y + b = x * 2 + return a + b + + graph = torch.fx.symbolic_trace(MergeModule()).graph + solver = DPBasedShardingSolver(_FakeOptimizer(graph)) + + topology = solver.build_topological_order() + + assert all(node.op != "output" for node in topology.nodes) + assert topology.nodes == [node for node in graph.nodes if node.op != "output"] + _assert_predecessors_match_graph_indegrees(topology) + + for node, predecessors in 
topology.predecessors.items(): + node_index = topology.node_to_index[node] + for pred in predecessors: + assert topology.node_to_index[pred] < node_index + + merge = topology.nodes[-1] + assert [pred.name for pred in topology.predecessors[merge]] == ["add", "mul"] + + +def test_dp_solver_preserves_duplicate_predecessors(): + class DuplicateInputModule(torch.nn.Module): + def forward(self, x): + return x + x + + graph = torch.fx.symbolic_trace(DuplicateInputModule()).graph + solver = DPBasedShardingSolver(_FakeOptimizer(graph)) + + topology = solver.build_topological_order() + _assert_predecessors_match_graph_indegrees(topology) + + add_node = next(node for node in topology.nodes if node.op == "call_function") + predecessors = topology.predecessors[add_node] + assert len(predecessors) == 2 + assert predecessors[0] is predecessors[1] + assert predecessors[0].name == "x" + + +def test_dp_solver_topology_for_tiny_transformer_forward(): + class TinyTransformerBlock(torch.nn.Module): + def __init__(self): + super().__init__() + self.q = torch.nn.Linear(8, 8) + self.k = torch.nn.Linear(8, 8) + self.v = torch.nn.Linear(8, 8) + self.o = torch.nn.Linear(8, 8) + self.ff1 = torch.nn.Linear(8, 16) + self.ff2 = torch.nn.Linear(16, 8) + + def forward(self, x): + q = self.q(x) + k = self.k(x) + v = self.v(x) + scores = q @ k.transpose(-2, -1) / math.sqrt(8) + attn = F.softmax(scores, dim=-1) + attn_out = attn @ v + x = x + self.o(attn_out) + hidden = F.relu(self.ff1(x)) + return x + self.ff2(hidden) + + block = TinyTransformerBlock() + assert block(torch.randn(2, 4, 8)).shape == (2, 4, 8) + + graph = torch.fx.symbolic_trace(block).graph + solver = DPBasedShardingSolver(_FakeOptimizer(graph)) + + topology = solver.build_topological_order() + _assert_predecessors_match_graph_indegrees(topology) + node_names = [node.name for node in topology.nodes] + + assert node_names == [ + "x", + "q", + "k", + "v", + "transpose", + "matmul", + "truediv", + "softmax", + "matmul_1", + "o", + "add", 
+ "ff1", + "relu", + "ff2", + "add_1", + ] + + add_nodes = [node for node in topology.nodes if node.target is operator.add] + assert [node.name for node in add_nodes] == ["add", "add_1"] + assert [pred.name for pred in topology.predecessors[add_nodes[0]]] == ["x", "o"] + assert [pred.name for pred in topology.predecessors[add_nodes[1]]] == [ + "add", + "ff2", + ] + + +def test_dp_solver_solution_is_not_implemented(): + class SimpleModule(torch.nn.Module): + def forward(self, x): + return x + 1 + + graph = torch.fx.symbolic_trace(SimpleModule()).graph + solver = DPBasedShardingSolver(_FakeOptimizer(graph)) + + with pytest.raises(NotImplementedError, match="only builds topological order"): + solver.get_solution() diff --git a/tests/test_lp_relaxation.py b/tests/test_lp_relaxation.py new file mode 100644 index 00000000..1b03e6fe --- /dev/null +++ b/tests/test_lp_relaxation.py @@ -0,0 +1,103 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
+ +import math + +import pulp +import pytest +import torch +from conftest import apply_cuda_patches +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel + + +def _fake_dp4_tp4_mesh(): + return torch.distributed.device_mesh.init_device_mesh( + "cuda", + (4, 4), + mesh_dim_names=("dp", "tp"), + ) + + +def _llama3_example_autop(device_mesh): + vocab_size = 128 + seq_len = 16 + batch_size = 2 * device_mesh.shape[0] + model_args = TransformerModelArgs( + dim=64, + n_layers=1, + n_heads=4, + n_kv_heads=2, + vocab_size=vocab_size, + multiple_of=32, + rope_theta=500000, + max_seq_len=seq_len, + ) + with torch.device("meta"): + model = Transformer(model_args) + + def input_fn(): + return torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") + + mp_policy = MixedPrecisionPolicy( + param_dtype=torch.bfloat16, + reduce_dtype=torch.float32, + ) + return AutoParallel( + model, + input_fn, + device_mesh, + mp_policy, + repeated_subgraphs=True, + ) + + +@apply_cuda_patches +@pytest.mark.filterwarnings("ignore:Constructing LpVariable") +@pytest.mark.filterwarnings("ignore:Using LpProblem.constraints") +def test_lp_relaxation_certifies_llama3_example_search(): + mesh = _fake_dp4_tp4_mesh() + with _llama3_example_autop(mesh) as autop: + autop.add_parameter_memory_constraint(low=None, high=None) + x_sharding = (Shard(0), Replicate()) + out_sharding = (Shard(0), Shard(2)) + autop.add_input_constraints([x_sharding]) + autop.add_output_constraints([out_sharding]) + + opt = autop.sharding_optimizer + + binary_vars = list(opt.pulp_variables.values()) + assert binary_vars + assert all(var.cat == pulp.LpInteger for var in binary_vars) + assert all(var.lowBound == 0 and var.upBound == 1 for var in binary_vars) + + continuous_vars = opt._create_pulp_variables(pulp.LpContinuous) + 
assert continuous_vars + assert all(var.cat == pulp.LpContinuous for var in continuous_vars.values()) + assert all( + var.lowBound == 0 and var.upBound == 1 for var in continuous_vars.values() + ) + + lower_bound = opt.get_lower_bound() + assert lower_bound.status == "Optimal" + assert math.isfinite(lower_bound.objective) + assert lower_bound.objective >= 0 + + assert not hasattr(opt, "selected_keys") + assert opt.prob.objective is None + assert all(var.cat == pulp.LpInteger for var in opt.pulp_variables.values()) + + solution = opt.get_solution() + feasible_cost = pulp.value(opt.prob.objective) + certificate_gap = ( + feasible_cost - lower_bound.objective + ) / lower_bound.objective + assert solution + assert lower_bound.objective <= feasible_cost + 1e-5 + assert certificate_gap >= -1e-8 + assert math.isfinite(certificate_gap) From ad7ee80972acfd1994e8ff11ec0564d4d479273b Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Fri, 29 May 2026 15:48:49 -0700 Subject: [PATCH 04/27] Checkpoint scratch LP benchmark and ignore reference PDFs Snapshot the current working tree before adding the approximate sharding solver. Tracks the scratch _bench_lp_3d.py benchmark and adds *.pdf to .gitignore so reference papers stay out of git history. Authored with Claude. 
--- .gitignore | 2 + examples/_bench_lp_3d.py | 107 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 examples/_bench_lp_3d.py diff --git a/.gitignore b/.gitignore index ff4f7532..1a6228f1 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ .mypy_cache/ *.egg-info/ +*.pdf + build/ dist/ tmp/ diff --git a/examples/_bench_lp_3d.py b/examples/_bench_lp_3d.py new file mode 100644 index 00000000..5b08840b --- /dev/null +++ b/examples/_bench_lp_3d.py @@ -0,0 +1,107 @@ +"""Benchmark LP-relaxation solve time for LLaMA3 on a 3D mesh.""" +import logging +import os +import time + +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.WARNING) + +MODEL_TYPE = os.environ.get("MODEL_TYPE", "8b") +N_LAYERS = int(os.environ.get("N_LAYERS", "0")) # 0 => use default for model +SEQLEN = int(os.environ.get("SEQLEN", str(2048 * 4))) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) +MESH_NAMES = ("dp", "cp", "tp") + +world_size = 1 +for d in MESH_SHAPE: + world_size *= d + +fake_store = FakeStore() +torch.distributed.init_process_group("fake", store=fake_store, rank=0, world_size=world_size) + +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=MESH_NAMES) + +batch_size = 2 * mesh.shape[0] +seqlen = SEQLEN +vocab_size = 128256 +device = torch.device("cuda") + + +def model_fn(): + if MODEL_TYPE == "1b": + args = TransformerModelArgs( + dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, + 
ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000, + vocab_size=vocab_size, max_seq_len=seqlen, + ) + elif MODEL_TYPE == "8b": + args = TransformerModelArgs( + dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, + ffn_dim_multiplier=1.3, multiple_of=1024, rope_theta=500000, + vocab_size=vocab_size, max_seq_len=seqlen, + ) + elif MODEL_TYPE == "70b": + args = TransformerModelArgs( + dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, + ffn_dim_multiplier=1.3, multiple_of=4096, rope_theta=500000, + vocab_size=vocab_size, max_seq_len=seqlen, + ) + else: + raise ValueError(MODEL_TYPE) + if N_LAYERS: + args.n_layers = N_LAYERS + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, seqlen), device=device) + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) + +with torch.device("meta"): + model = model_fn() + +mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) + +print(f"=== model={MODEL_TYPE} n_layers={model.model_args.n_layers} " + f"mesh={MESH_SHAPE}{MESH_NAMES} world_size={world_size} ===") + +print("[build] entering AutoParallel (graph export + strategy enumeration)...", flush=True) +t_build = time.perf_counter() +with AutoParallel(model, input_fn, mesh, mp_policy, repeated_subgraphs=True) as autop: + print(f"[build] AutoParallel ready in {time.perf_counter() - t_build:.2f} s", flush=True) + autop.add_parameter_memory_constraint(low=None, high=None) + x_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1) + autop.add_input_constraints([x_sharding]) + autop.add_output_constraints([x_sharding]) + print(f"[build+constraints] {time.perf_counter() - t_build:.2f} s") + + opt = autop.sharding_optimizer + print(f"[problem] unique_vars={len(opt.pulp_variables)} " + f"constraints={len(opt.prob.constraints)}", flush=True) + + mode = os.environ.get("SOLVE_MODE", "lp") # lp | ilp | both + + if mode in ("lp", "both"): + res = opt.get_lower_bound(verbose=False) + print(f"[LP relaxation] 
status={res.status} objective={res.objective:.4f}") + print(f"[LP relaxation] solve_s={res.solve_s:.3f} total_s={res.total_s:.3f}", flush=True) + + if mode in ("ilp", "both"): + print("[ILP] solving (this may take a long time)...", flush=True) + t_ilp = time.perf_counter() + opt.get_solution(verbose=True) + import pulp + obj = pulp.value(opt.prob.objective) + print(f"[ILP] status={pulp.LpStatus[opt.prob.status]} objective={obj}") + print(f"[ILP] solve+extract_s={time.perf_counter() - t_ilp:.3f}", flush=True) From 6613928a8320b02c982f6a62b17102615706a693 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sat, 30 May 2026 10:51:28 -0700 Subject: [PATCH 05/27] Add approximate belief-propagation sharding solver Adds a heuristic alternative to the ILP for the placement problem, formulated as pairwise MRF energy minimization on the strategy DAG and solved with a sequential min-sum belief propagation over coupled groups, followed by coordinate-descent and star-block local search. The energy is an exact transcription of the ILP objective, so the assignment is scored identically and the gap is small (LP-certified within ~3-8% on LLaMA3 1B), while the solve runs ~10x faster than CBC and works on 3D meshes where the ILP is intractable. Exposed via optimize_placement(solver="approx"). Review order: optimize_sharding.py (idempotent _set_objective) and api.py (solver dispatch) are the integration points; approximate_sharding.py is the solver; test_approximate_sharding.py checks the objective gap, energy faithfulness, and flow feasibility against the ILP. Authored with Claude. 
--- autoparallel/api.py | 23 +- autoparallel/approximate_sharding.py | 1058 ++++++++++++++++++++++++++ autoparallel/optimize_sharding.py | 10 +- examples/_bench_approx.py | 166 ++++ tests/test_approximate_sharding.py | 140 ++++ 5 files changed, 1394 insertions(+), 3 deletions(-) create mode 100644 autoparallel/approximate_sharding.py create mode 100644 examples/_bench_approx.py create mode 100644 tests/test_approximate_sharding.py diff --git a/autoparallel/api.py b/autoparallel/api.py index 1670d509..907d6111 100644 --- a/autoparallel/api.py +++ b/autoparallel/api.py @@ -356,10 +356,29 @@ def add_output_constraints(self, constraints): self.sharding_optimizer.add_sharded_output_constraint(constraints) self.output_constraints = constraints - def optimize_placement(self, verbose=True): + def optimize_placement(self, verbose=True, solver="ilp", approximate_options=None): + """Solve for the optimal placement. + + solver="ilp" (default) uses the exact PuLP/CBC solver. solver="approx" + uses the heuristic ApproximateShardingSolver, which trades a small + objective gap for a much faster solve. approximate_options is forwarded + as kwargs to the approximate solver (e.g. candidate_limit, max_sweeps). 
+ """ self._assert_entered() - self.sharding_placement = self.sharding_optimizer.get_solution(verbose=False) + if solver in ("approx", "approximate"): + from .approximate_sharding import ApproximateShardingSolver + + approx = ApproximateShardingSolver( + self.sharding_optimizer, **(approximate_options or {}) + ) + self.sharding_placement = approx.get_solution(verbose=verbose) + elif solver == "ilp": + self.sharding_placement = self.sharding_optimizer.get_solution( + verbose=False + ) + else: + raise ValueError(f"Unknown solver={solver!r}; expected 'ilp' or 'approx'") if verbose: logger.info(self.sharding_optimizer.get_log(verbose=True)) diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py new file mode 100644 index 00000000..aba6111c --- /dev/null +++ b/autoparallel/approximate_sharding.py @@ -0,0 +1,1058 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + +""" +Approximate sharding solver. + +The ILP in :mod:`optimize_sharding` selects, for every operation, an output +placement and (per argument) the input placement of its producer. The flow +constraint forces a consumer's input placement to equal its producer's chosen +output placement, so the only genuinely free variables are the per-node output +strategy indices ``x_v``. The problem therefore reduces to a pairwise discrete +energy minimization over a DAG:: + + E(x) = Σ_v U_v(x_v) + Σ_{(u,v)} B_{uv}(x_u, x_v) + +where ``U_v`` is the compute cost and ``B_{uv}`` is the communication + +sharding-transition cost on the edge from producer ``u`` to consumer ``v``. + +This is a pairwise MRF. The autograd DAG has small in-degree (<3) but large +out-degree (tens) and a wide topological frontier (hundreds), so exact +frontier/junction-tree DP blows up. 
We instead solve it with **min-sum belief +propagation** (max-product in min-sum form) on the graph of *coupled groups*, +which propagates coordinated decisions globally, then polish with group-level +coordinate descent and a star-block local search. + +Nodes that must be chosen jointly are merged into groups: repeated-subgraph +cluster copies share a strategy index, and forward/backward pairs share an +output placement. The solver reuses the strategies, decision variables and +constraints already built by ``ShardingOptimizer`` (it replaces only the +CBC/ILP *solve*, not problem construction) and writes its assignment back into +the PuLP variables, so the result is scored with the exact same objective as the +ILP (``pulp.value(prob.objective)``). +""" + +import logging +import math +import time +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any, Optional + +import numpy as np +import pulp + +from .cost_models.compute_estimation import _get_sharded_shape_stride + +logger = logging.getLogger(__name__) + +INF = float("inf") +BIG = 1e12 # finite stand-in for forbidden combinations (avoids NaN in min-sum) + +# Paired forward/backward constraints couple two nodes to the *same output +# placement* (the strategy index may differ between the two strategy lists). 
+_PAIRED_PREFIXES = ( + "grad_param_constraint", + "grad_input_constraint", + "grad_output_constraint", +) + + +@dataclass +class ApproximateSolveResult: + objective: float + status: str + build_s: float + solve_s: float + total_s: float + num_groups: int + num_nodes: int + + +@dataclass +class _Group: + """A set of node indices chosen jointly (cluster copies share a strategy + index; forward/backward pairs share an output placement).""" + + members: list[int] + cost_bearing: list[int] = field(default_factory=list) + choices: list[dict[int, int]] = field(default_factory=list) # member -> out_idx + current: int = 0 + + @property + def domain(self) -> int: + return len(self.choices) + + +class _UnionFind: + def __init__(self, n: int): + self.parent = list(range(n)) + + def find(self, x: int) -> int: + root = x + while self.parent[root] != root: + root = self.parent[root] + while self.parent[x] != root: + self.parent[x], x = root, self.parent[x] + return root + + def union(self, a: int, b: int) -> None: + ra, rb = self.find(a), self.find(b) + if ra != rb: + self.parent[rb] = ra + + +class ApproximateShardingSolver: + """Approximate solver for the sharding placement problem on an already-built + :class:`ShardingOptimizer`. + + Call :meth:`get_solution` for a ``{node: OpSpec}`` dict (same format as + ``ShardingOptimizer.get_solution``); it also fills the PuLP variables and + ``optimizer.selected_keys`` so the assignment can be scored/inspected exactly + like an ILP solution. 
+ """ + + def __init__( + self, + optimizer, + candidate_limit: Optional[int] = 64, + bp_iters: int = 20, + bp_damping: float = 0.2, + bp_tol: float = 1e-3, + max_sweeps: int = 12, + max_time_s: float = 60.0, + star_passes: int = 2, + max_star_children: int = 32, + group_domain_limit: int = 512, + ): + self.opt = optimizer + self.candidate_limit = candidate_limit + self.bp_iters = bp_iters + self.bp_damping = bp_damping + self.bp_tol = bp_tol + self.max_sweeps = max_sweeps + self.max_time_s = max_time_s + self.star_passes = star_passes + self.max_star_children = max_star_children + self.group_domain_limit = group_domain_limit + + # Populated by _build_problem(). + self.cost_bearing: list[int] = [] + self.node_mult: dict[int, int] = {} + self.forbidden: set[tuple] = set() + self.allowed_out: dict[int, list[int]] = {} + self.groups: list[_Group] = [] + self.node_to_group: dict[int, int] = {} + self.input_edges: dict[int, list[tuple[int, int]]] = {} + self._arg_prod: dict[int, dict[int, int]] = {} + self.consumers: dict[int, list[tuple[int, int]]] = defaultdict(list) + self.cur_out: dict[int, int] = {} + self._memory: Optional[dict[str, Any]] = None + + # Populated by _build_factors(). + self.g_unary: list[np.ndarray] = [] + self.C: dict[tuple, np.ndarray] = {} + self.nbrs: list[list[int]] = [] + + # ------------------------------------------------------------------ # + # Public entry point + # ------------------------------------------------------------------ # + def get_solution(self, verbose: bool = False): + result, solution = self._solve(verbose=verbose) + self.result = result + return solution + + def _solve(self, verbose: bool = False): + opt = self.opt + if getattr(opt, "solver_backend", "ilp") != "ilp": + raise RuntimeError( + "ApproximateShardingSolver requires an ILP-built optimizer " + "(decision_vars / pulp_variables / constraints)." 
+ ) + t0 = time.perf_counter() + self._build_problem() + t_bp = time.perf_counter() + self._build_factors() + t_bf = time.perf_counter() + t_build = t_bf - t0 + if verbose: + logger.info( + "approx build: problem=%.2fs %s factors=%.2fs groups=%d " + "cost_bearing=%d edges=%d max_domain=%d", + t_bp - t0, getattr(self, "_build_times", {}), t_bf - t_bp, + len(self.groups), len(self.cost_bearing), + sum(len(v) for v in self.input_edges.values()), + max((g.domain for g in self.groups), default=0), + ) + + deadline = t0 + self.max_time_s + # Candidate 1: belief propagation init. + t_bp0 = time.perf_counter() + self._belief_propagation() + if verbose: + logger.info("approx phase: bp converged iter=%s delta=%.4g in %.2fs; " + "bp_decode energy=%.1f", + getattr(self, "_bp_last_iter", None), + getattr(self, "_bp_last_delta", float("nan")), + time.perf_counter() - t_bp0, + self._fast_total_energy()) + self._memory_repair() + self._coordinate_descent(deadline) + if verbose: + logger.info("approx phase: bp+cd energy=%.1f", self._fast_total_energy()) + self._star_block_search(deadline) + bp_energy = self._fast_total_energy() + bp_snapshot = [g.current for g in self.groups] + if verbose: + logger.info("approx phase: bp+cd+star energy=%.1f", bp_energy) + + # Candidate 2: greedy init (cheap insurance against BP doing poorly). 
+ self._greedy_init() + self._memory_repair() + self._coordinate_descent(deadline) + self._star_block_search(deadline) + greedy_energy = self._fast_total_energy() + if verbose: + logger.info("approx phase: greedy+cd+star energy=%.1f", greedy_energy) + + if bp_energy <= greedy_energy: + for gid, ci in enumerate(bp_snapshot): + self._set_group(gid, ci) + t_solve = time.perf_counter() - t0 - t_build + + objective = self._write_back() + total_s = time.perf_counter() - t0 + infeasible = not math.isfinite(objective) + status = "Infeasible" if infeasible else "Heuristic" + result = ApproximateSolveResult( + objective=objective, + status=status, + build_s=t_build, + solve_s=t_solve, + total_s=total_s, + num_groups=len(self.groups), + num_nodes=len(self.cost_bearing), + ) + logger.info( + "ApproximateShardingSolver: status=%s objective=%.4f " + "(bp=%.1f greedy=%.1f) groups=%d nodes=%d " + "timings={build=%.3fs,solve=%.3fs,total=%.3fs}", + status, + objective, + bp_energy, + greedy_energy, + len(self.groups), + len(self.cost_bearing), + t_build, + t_solve, + total_s, + ) + opt.profile["approximate"] = { + "objective": objective, + "status": status, + "build_s": t_build, + "solve_s": t_solve, + "total_s": total_s, + "groups": len(self.groups), + "bp_energy": bp_energy, + "greedy_energy": greedy_energy, + } + if infeasible: + raise RuntimeError( + "ApproximateShardingSolver could not find a feasible assignment. " + "User constraints may be contradictory or the mesh too small." 
+ ) + solution = opt._to_orig_solution(opt._extract_and_validate_solution()) + return result, solution + + # ------------------------------------------------------------------ # + # Problem construction + # ------------------------------------------------------------------ # + def _build_problem(self): + opt = self.opt + cluster_linked = {key[0] for key in opt.cluster_links} + self.cost_bearing = [ + opt.node_map[node] + for node in opt.strats + if node.op != "output" and opt.node_map[node] not in cluster_linked + ] + + root_to_copies: dict[int, set] = defaultdict(set) + for linked_key, root_key in opt.cluster_links.items(): + root_to_copies[root_key[0]].add(linked_key[0]) + self.node_mult = { + v: 1 + len(root_to_copies.get(v, ())) for v in self.cost_bearing + } + + self.allowed_out = {} + for node, strat in opt.strats.items(): + if node.op == "output": + continue + self.allowed_out[opt.node_map[node]] = list(range(len(strat.strategies))) + + t = time.perf_counter() + paired_edges, authoritative = self._parse_constraints() + # Flow edges are taken from the ILP's output_input_consistent constraints + # (the authoritative producer per consumer-arg), NOT from _all_input_nodes: + # the two disagree for some ops (einsum list-args, alias/backward nodes), + # and trusting _all_input_nodes yields flow-infeasible assignments. The + # producer here is the (possibly cluster-resolved) node carrying the + # producer's pulp variable; the ILP guarantees its out_idx range matches + # the consumer's inp_idx range for that arg. 
+ self._arg_prod: dict[int, dict[int, int]] = defaultdict(dict) + flow_couplings = [] # producer sets forced to share an out_idx + for (c_idx, argi), producers in authoritative.items(): + rep = min(producers) # all coupled -> same out, any representative + self._arg_prod[c_idx][argi] = rep + if len(producers) > 1: + flow_couplings.append(producers) + self.input_edges = {} + self.consumers = defaultdict(list) + for v in self.cost_bearing: + edges = sorted(self._arg_prod.get(v, {}).items()) + self.input_edges[v] = edges + for argi, p in edges: + self.consumers[p].append((v, argi)) + t_parse = time.perf_counter() + + # Remove fully-forbidden out_idx for cost-bearing nodes. + for v in self.cost_bearing: + node = opt.nodes[v] + self.allowed_out[v] = [ + o for o in self.allowed_out[v] if not self._out_fully_forbidden(v, node, o) + ] + t_forbid = time.perf_counter() + + self._build_memory_info() # also pins params when the budget is tight + t_mem = time.perf_counter() + self._build_groups(paired_edges, flow_couplings) + t_groups = time.perf_counter() + self._prune_candidates() + self._build_times = { + "parse": t_parse - t, + "forbid": t_forbid - t_parse, + "memory": t_mem - t_forbid, + "groups": t_groups - t_mem, + "prune": time.perf_counter() - t_groups, + } + + # Constraint families that never restrict the per-node out_idx domain and + # are handled structurally (flow/uniqueness) or via the cost sentinel below. + # Skipping them by name avoids materializing items() for the ~majority of the + # (often >100k) constraints. + _SKIP_PREFIXES = ( + "unique_decision", + "same_across_args", + "inf_cases", + "memory_constraint", + ) + + def _parse_constraints(self): + opt = self.opt + # inf-cost keys are forced to 0 by add_inf_cost_constraint, which also + # stamps dv.cost = 10000.0. Detect them directly instead of parsing the + # (very numerous) inf_cases constraints. 
+ for key, dv in opt.decision_vars.items(): + if dv.cost == 10000.0: + self.forbidden.add(key) + + var_to_key = {var: key for key, var in opt.pulp_variables.items()} + restrict: dict[int, set] = {} + paired_edges: list[tuple[int, int, frozenset]] = [] + # (consumer_idx, argi) -> set of producer_idx, from flow constraints. A + # clustered consumer's single inp variable is shared across all its + # copies, so the ILP couples one producer per copy (resolved to its root) + # to that inp, forcing them all equal; we collect the whole set. + authoritative: dict[tuple[int, int], set] = {} + for name, c in opt.prob.constraints.items(): + if name.startswith("output_input_consistent"): + # +side = producer (grouped by out), -side = consumer (grouped by + # inp at a fixed arg). One +var and one -var pin down the edge. + pos_key = neg_key = None + for var, coeff in c.items(): + k = var_to_key.get(var) + if k is None: + continue + if coeff > 0: + pos_key = pos_key or k + else: + neg_key = neg_key or k + if pos_key is not None and neg_key is not None: + break + if pos_key is not None and neg_key is not None: + authoritative.setdefault( + (neg_key[0], neg_key[1]), set() + ).add(pos_key[0]) + continue + if name.startswith(self._SKIP_PREFIXES): + continue + items = list(c.items()) + if not items: + continue + rhs = -c.constant + coeffs = [coeff for _, coeff in items] + keys = [var_to_key.get(var) for var, _ in items] + if any(k is None for k in keys): + continue + all_pos = all(coeff > 0 for coeff in coeffs) + if c.sense == pulp.LpConstraintEQ and rhs == 0 and all_pos: + self.forbidden.update(keys) # Σ vars == 0 (inf / dtype / disable) + elif c.sense == pulp.LpConstraintEQ and rhs == 1 and all_pos: + nodes = {k[0] for k in keys} + if len(nodes) == 1: + n = next(iter(nodes)) + out_set = {k[2] for k in keys} + restrict[n] = restrict.get(n, out_set) & out_set + elif ( + c.sense == pulp.LpConstraintEQ + and rhs == 0 + and any(name.startswith(p) for p in _PAIRED_PREFIXES) + and "disable" 
not in name + ): + pos = {k for k, coeff in zip(keys, coeffs) if coeff > 0} + neg = {k for k, coeff in zip(keys, coeffs) if coeff < 0} + na, nb = {k[0] for k in neg}, {k[0] for k in pos} + oa, ob = {k[2] for k in neg}, {k[2] for k in pos} + if len(na) == 1 and len(nb) == 1 and len(oa) == 1 and len(ob) == 1: + paired_edges.append( + (next(iter(na)), next(iter(nb)), + frozenset({(next(iter(oa)), next(iter(ob)))})) + ) + for n, out_set in restrict.items(): + if n in self.allowed_out: + self.allowed_out[n] = [o for o in self.allowed_out[n] if o in out_set] + return paired_edges, authoritative + + def _out_fully_forbidden(self, v, node, o): + strat = self.opt.strats[node].strategies[o] + for argi, costs in enumerate(strat.redistribute_cost): + if all((v, argi, o, inp) in self.forbidden for inp in range(len(costs))): + return True + return False + + def _build_groups(self, paired_edges, flow_couplings): + opt = self.opt + n = len(opt.nodes) + uf = _UnionFind(n) + # cluster_links has one entry per option-key; collapse to unique + # (linked_node, root_node) pairs so the K-scaled loops below run over + # hundreds of pairs, not millions of duplicates. + cluster_pairs = {(lk[0], rk[0]) for lk, rk in opt.cluster_links.items()} + for li, ri in cluster_pairs: + uf.union(li, ri) + for a, b, _ in paired_edges: + uf.union(a, b) + + allow: dict[tuple, dict[int, set]] = defaultdict(lambda: defaultdict(set)) + adj: dict[int, set] = defaultdict(set) + for li, ri in cluster_pairs: + for o in self.allowed_out.get(ri, []): + allow[(ri, li)][o].add(o) + for o in self.allowed_out.get(li, []): + allow[(li, ri)][o].add(o) + adj[li].add(ri) + adj[ri].add(li) + for a, b, pairs in paired_edges: + for oa, ob in pairs: + allow[(a, b)][oa].add(ob) + allow[(b, a)][ob].add(oa) + adj[a].add(b) + adj[b].add(a) + # Flow couplings: producers feeding a clustered consumer's shared inp are + # forced to the same out_idx (same-index coupling, star to the rep). 
+ for producers in flow_couplings: + ps = sorted(producers) + rep = ps[0] + for q in ps[1:]: + uf.union(rep, q) + for o in self.allowed_out.get(rep, []): + allow[(rep, q)][o].add(o) + for o in self.allowed_out.get(q, []): + allow[(q, rep)][o].add(o) + adj[rep].add(q) + adj[q].add(rep) + + comps: dict[int, list[int]] = defaultdict(list) + for node in opt.strats: + if node.op == "output": + continue + v = opt.node_map[node] + comps[uf.find(v)].append(v) + + cost_bearing_set = set(self.cost_bearing) + self.groups = [] + self.node_to_group = {} + for members in comps.values(): + members.sort() + group = _Group(members=members) + group.cost_bearing = [m for m in members if m in cost_bearing_set] + group.choices = self._enumerate_choices(members, allow, adj) + if not group.choices: + raise RuntimeError( + f"No feasible joint choice for group {members}; " + "constraints are contradictory." + ) + gid = len(self.groups) + self.groups.append(group) + for m in members: + self.node_to_group[m] = gid + + def _enumerate_choices(self, members, allow, adj): + if len(members) == 1: + v = members[0] + return [{v: o} for o in self.allowed_out.get(v, [])] + member_set = set(members) + # BFS order from a representative so every member after the first is + # adjacent to an already-assigned one; coupling then propagates + # deterministically (no spurious K-way branching that explodes the + # domain for large cluster+paired groups). 
+ order = [] + seen = set() + for start in members: + if start in seen: + continue + queue = [start] + seen.add(start) + while queue: + m = queue.pop(0) + order.append(m) + for nb in adj[m]: + if nb in member_set and nb not in seen: + seen.add(nb) + queue.append(nb) + results: list[dict[int, int]] = [] + limit = self.group_domain_limit + + def candidates(m, assign): + cand = None + for nb in adj[m]: + if nb in assign and nb in member_set: + allowed = allow[(nb, m)].get(assign[nb], set()) + cand = allowed if cand is None else (cand & allowed) + cand = set(self.allowed_out.get(m, [])) if cand is None else ( + cand & set(self.allowed_out.get(m, []))) + return cand + + def dfs(i, assign): + if len(results) >= limit: + return + if i == len(order): + results.append(dict(assign)) + return + m = order[i] + for val in sorted(candidates(m, assign)): + assign[m] = val + dfs(i + 1, assign) + del assign[m] + if len(results) >= limit: + return + + dfs(0, {}) + if len(results) >= limit: + logger.warning( + "Approximate solver: group of %d nodes hit group_domain_limit=%d.", + len(members), limit, + ) + return results + + def _prune_candidates(self): + if self.candidate_limit is None: + return + for group in self.groups: + if len(group.members) != 1 or len(group.choices) <= self.candidate_limit: + continue + v = group.members[0] + node = self.opt.nodes[v] + lbs = sorted( + (self._choice_lower_bound(v, node, c[v]), ci) + for ci, c in enumerate(group.choices) + ) + keep = {ci for _, ci in lbs[: self.candidate_limit]} + group.choices = [group.choices[ci] for ci in sorted(keep)] + + def _choice_lower_bound(self, v, node, o): + opt = self.opt + strat = opt.strats[node].strategies[o] + mult = self.node_mult[v] + lb = opt.decision_vars[(v, 0, o, 0)].compute_cost * len(strat.redistribute_cost) + lb *= mult + for argi, _p in self.input_edges.get(v, []): + best = INF + for inp in range(len(strat.redistribute_cost[argi])): + key = (v, argi, o, inp) + if key in self.forbidden: + continue + dv 
= opt.decision_vars[key] + best = min(best, dv.comm_cost + dv.sharding_transition_cost) + if math.isfinite(best): + lb += mult * best + return lb + + # ------------------------------------------------------------------ # + # Memory constraint (ratios, budget, tight-budget param pinning) + # ------------------------------------------------------------------ # + def _build_memory_info(self): + opt = self.opt + factors = None + for fname, kwargs in getattr(opt, "_constraint_log", []): + if fname == "add_parameter_memory_constraint": + factors = kwargs + if factors is None: + return + try: + from torch._functorch._aot_autograd.fx_utils import get_param_nodes + + param_nodes = get_param_nodes(opt.graph) + except Exception: + return + + low_f, high_f = factors["memory_factor_low"], factors["memory_factor_high"] + budget_low = budget_high = 0.0 + param_idxs, ratios = [], {} + for node in param_nodes: + v = opt.node_map[node] + param_idxs.append(v) + r = {o: self._param_ratio(v, node, o) for o in self.allowed_out.get(v, [])} + ratios[v] = r + best = min(r.values()) + budget_low += max(best, low_f) + budget_high += max(best, high_f) + + tight = abs(budget_high - budget_low) < 1e-9 + if tight: + # Σ ratio == Σ min(ratio) forces every param to a min-ratio choice. 
+ for v in param_idxs: + r = ratios[v] + mn = min(r.values()) + self.allowed_out[v] = [o for o in self.allowed_out[v] + if r[o] <= mn + 1e-12] + self._memory = { + "param_idxs": param_idxs, + "ratios": ratios, + "budget_low": budget_low, + "budget_high": budget_high, + "tight": tight, + } + + def _param_ratio(self, v, node, o): + spec = self.opt.decision_vars[(v, 0, o, 0)].input_spec + new_shape, _ = _get_sharded_shape_stride(spec) + return math.prod(new_shape) / math.prod(spec.tensor_meta.shape) + + # ------------------------------------------------------------------ # + # Factor graph (numpy unary + pairwise matrices over groups) + # ------------------------------------------------------------------ # + def _build_factors(self): + G = len(self.groups) + # per member, its out_idx across its group's choices + member_vals = [] + for group in self.groups: + mv = {} + for m in group.cost_bearing: + mv[m] = np.array([c[m] for c in group.choices], dtype=np.int64) + # also predecessors that are non-cost-bearing but in this group + for m in group.members: + if m not in mv: + mv[m] = np.array([c[m] for c in group.choices], dtype=np.int64) + member_vals.append(mv) + + self.g_unary = [np.zeros(g.domain) for g in self.groups] + for gid, group in enumerate(self.groups): + for m in group.cost_bearing: + vals = member_vals[gid][m] + self.g_unary[gid] += self.node_mult[m] * self._self_cost_vec(m, vals) + + C: dict[tuple, np.ndarray] = {} + nbr_set: list[set] = [set() for _ in range(G)] + for v in self.cost_bearing: + gv = self.node_to_group[v] + mult = self.node_mult[v] + for argi, p in self.input_edges[v]: + gp = self.node_to_group[p] + R = self._edge_matrix(v, argi, p) # (Kv, Kp) raw, BIG if forbidden + av = member_vals[gv][v] + bp = member_vals[gp][p] + contrib = mult * R[np.ix_(av, bp)] # (D_gv, D_gp) + if gv == gp: + self.g_unary[gv] += np.diagonal(contrib) + else: + a, b = (gv, gp) if gv < gp else (gp, gv) + mat = contrib if gv < gp else contrib.T + if (a, b) in C: + C[(a, 
b)] += mat + else: + C[(a, b)] = mat.copy() + nbr_set[a].add(b) + nbr_set[b].add(a) + self.C = C + self.nbrs = [sorted(s) for s in nbr_set] + + def _self_cost_vec(self, m, out_indices): + """Vectorized self-cost (compute + producer-less arg costs) for node m + over an array of out_idx.""" + opt = self.opt + node = opt.nodes[m] + prod = self._arg_prod.get(m, {}) + out = np.empty(len(out_indices)) + for i, o in enumerate(out_indices): + strat = opt.strats[node].strategies[o] + n_args = len(strat.redistribute_cost) + dv0 = opt.decision_vars[(m, 0, o, 0)] + c = dv0.compute_cost * n_args + # Args with no flow edge (constructors / None-spec) are scored at + # inp=0 here; args with a producer are charged via the pairwise edges. + for argi in range(n_args): + if argi in prod: + continue + key = (m, argi, o, 0) + if key in self.forbidden: + c = BIG + break + dv = opt.decision_vars[key] + c += dv.comm_cost + dv.sharding_transition_cost + out[i] = c + return out + + def _edge_matrix(self, v, argi, p): + """Raw (Kv, Kp) edge cost matrix R[o_v][o_p] = comm + transition, BIG when + the (o_v, o_p) combination is forbidden. 
Only entries that can actually be + indexed by the group choices are filled; the rest are BIG.""" + opt = self.opt + Kv = len(opt.strats[opt.nodes[v]].strategies) + Kp = len(opt.strats[opt.nodes[p]].strategies) + R = np.full((Kv, Kp), BIG) + gv = self.node_to_group[v] + gp = self.node_to_group[p] + ov_vals = sorted({c[v] for c in self.groups[gv].choices}) + op_vals = sorted({c[p] for c in self.groups[gp].choices}) + for ov in ov_vals: + for op in op_vals: + key = (v, argi, ov, op) + if key in self.forbidden: + continue + dv = opt.decision_vars[key] + R[ov, op] = dv.comm_cost + dv.sharding_transition_cost + return R + + def _pair_matrix(self, g, h): + """Pairwise cost oriented as (x_g, x_h).""" + if g < h: + return self.C[(g, h)] + return self.C[(h, g)].T + + # ------------------------------------------------------------------ # + # Energy (fast, numpy) + # ------------------------------------------------------------------ # + def _fast_group_energy(self, gid, ci): + e = self.g_unary[gid][ci] + for h in self.nbrs[gid]: + ch = self.groups[h].current + e += self.C[(gid, h)][ci, ch] if gid < h else self.C[(h, gid)][ch, ci] + return e + + def _fast_total_energy(self): + total = 0.0 + for gid, g in enumerate(self.groups): + total += self.g_unary[gid][g.current] + for (a, b), mat in self.C.items(): + total += mat[self.groups[a].current, self.groups[b].current] + return total + + # ------------------------------------------------------------------ # + # Belief propagation (min-sum) + decode + # ------------------------------------------------------------------ # + def _belief_propagation(self): + """Sequential (forward-backward, topological) min-sum message passing. 
+ Exact MAP on trees in one sweep; near-optimal on the near-tree transformer + graph in a few sweeps, far better than synchronous flooding.""" + G = len(self.groups) + unary = self.g_unary + nbrs = self.nbrs + damp = self.bp_damping + + order = sorted(range(G), key=lambda g: min(self.groups[g].members)) + msg: dict[tuple, np.ndarray] = {} + for g in range(G): + for h in nbrs[g]: + msg[(g, h)] = np.zeros(len(unary[h])) + + for sweep in range(self.bp_iters): + max_delta = 0.0 + for direction in (order, order[::-1]): + for g in direction: + if not nbrs[g]: + continue + in_sum = unary[g].copy() + for k in nbrs[g]: + in_sum += msg[(k, g)] + for h in nbrs[g]: + excl = in_sum - msg[(h, g)] + P = self._pair_matrix(g, h) # (D_g, D_h) + m = (excl[:, None] + P).min(axis=0) + m -= m.min() + md = (1 - damp) * m + damp * msg[(g, h)] + delta = np.abs(md - msg[(g, h)]).max() + if delta > max_delta: + max_delta = delta + msg[(g, h)] = md + self._bp_last_iter = sweep + 1 + self._bp_last_delta = max_delta + if max_delta < self.bp_tol: + break + + self._decode(msg) + + def _decode(self, msg): + """Sequential topological decode: fix each group to the argmin of its + belief conditioned on already-decoded neighbors (exact pairwise cost) and + BP messages for the rest. 
Produces a consistent, forbidden-avoiding + assignment, unlike independent argmin on a loopy graph.""" + G = len(self.groups) + order = sorted(range(G), key=lambda g: min(self.groups[g].members)) + decided: dict[int, int] = {} + for g in order: + b = self.g_unary[g].copy() + for h in self.nbrs[g]: + if h in decided: + b = b + self._pair_matrix(g, h)[:, decided[h]] + else: + b = b + msg[(h, g)] + ci = int(np.argmin(b)) + decided[g] = ci + self._set_group(g, ci) + + # ------------------------------------------------------------------ # + # Local search + # ------------------------------------------------------------------ # + def _set_group(self, gid, ci): + group = self.groups[gid] + group.current = ci + for m, o in group.choices[ci].items(): + self.cur_out[m] = o + + def _greedy_init(self): + order = sorted(range(len(self.groups)), + key=lambda g: min(self.groups[g].members)) + for gid in order: + self._set_group(gid, 0) + for gid in order: + best_i, best_e = 0, INF + for ci in range(self.groups[gid].domain): + e = self.g_unary[gid][ci] + for h in self.nbrs[gid]: + if min(self.groups[h].members) < min(self.groups[gid].members): + ch = self.groups[h].current + e += (self.C[(gid, h)][ci, ch] if gid < h + else self.C[(h, gid)][ch, ci]) + if e < best_e: + best_i, best_e = ci, e + self._set_group(gid, best_i) + + def _coordinate_descent(self, deadline): + for _ in range(self.max_sweeps): + if time.perf_counter() > deadline: + break + improved = False + for gid in range(len(self.groups)): + if self.groups[gid].domain <= 1: + continue + cur = self.groups[gid].current + best_i, best_e = cur, self._fast_group_energy(gid, cur) + for ci in range(self.groups[gid].domain): + if ci == cur: + continue + e = self._fast_group_energy(gid, ci) + if e < best_e - 1e-6 and self._memory_ok_after(gid, ci): + best_i, best_e = ci, e + if best_i != cur: + self._set_group(gid, best_i) + improved = True + if not improved: + break + + def _star_block_search(self, deadline): + ranked = sorted( + 
((len(self.nbrs[g]), g) for g in range(len(self.groups)) + if len(self.nbrs[g]) >= 2 and self.groups[g].domain > 1), + reverse=True, + ) + for _ in range(self.star_passes): + if time.perf_counter() > deadline: + break + improved = False + for _deg, gid in ranked: + if time.perf_counter() > deadline: + break + if self._optimize_star(gid): + improved = True + if not improved: + break + + def _optimize_star(self, gid): + children = [h for h in self.nbrs[gid] if self.groups[h].domain > 1] + child_costs = sorted( + ((self._fast_group_energy(h, self.groups[h].current), h) for h in children), + reverse=True, + ) + child_ids = [h for _e, h in child_costs[: self.max_star_children]] + if not child_ids: + return False + block = [gid, *child_ids] + base = self._block_energy(block) + best_energy = base + best_center = self.groups[gid].current + best_children = {h: self.groups[h].current for h in child_ids} + for ci in range(self.groups[gid].domain): + self._set_group(gid, ci) + if not self._memory_ok_after(gid, ci): + continue + chosen = {} + for h in child_ids: + b_i, b_e = self.groups[h].current, INF + for hi in range(self.groups[h].domain): + e = self._fast_group_energy(h, hi) + if e < b_e: + b_i, b_e = hi, e + self._set_group(h, b_i) + chosen[h] = b_i + energy = self._block_energy(block) + if energy < best_energy - 1e-6 and self._block_memory_ok(): + best_energy = energy + best_center = ci + best_children = dict(chosen) + self._set_group(gid, best_center) + for h, hi in best_children.items(): + self._set_group(h, hi) + return best_energy < base - 1e-6 + + def _block_energy(self, gids): + total = 0.0 + seen_edges = set() + for g in gids: + total += self.g_unary[g][self.groups[g].current] + for h in self.nbrs[g]: + key = (g, h) if g < h else (h, g) + if key in seen_edges: + continue + seen_edges.add(key) + a, b = key + total += self.C[key][self.groups[a].current, self.groups[b].current] + return total + + # ------------------------------------------------------------------ # 
+ # Memory repair + # ------------------------------------------------------------------ # + def _current_memory(self): + if self._memory is None: + return 0.0 + return sum(self._memory["ratios"][v][self.cur_out[v]] + for v in self._memory["param_idxs"]) + + def _memory_ok_after(self, gid, ci): + if self._memory is None or self._memory.get("tight"): + return True + ratios = self._memory["ratios"] + choice = self.groups[gid].choices[ci] + delta = sum(ratios[m][o] - ratios[m][self.cur_out[m]] + for m, o in choice.items() if m in ratios) + mem = self._current_memory() + delta + return (self._memory["budget_low"] - 1e-6 <= mem + <= self._memory["budget_high"] + 1e-6) + + def _block_memory_ok(self): + if self._memory is None or self._memory.get("tight"): + return True + mem = self._current_memory() + return (self._memory["budget_low"] - 1e-6 <= mem + <= self._memory["budget_high"] + 1e-6) + + def _memory_repair(self): + if self._memory is None or self._memory.get("tight"): + return + low, high = self._memory["budget_low"], self._memory["budget_high"] + ratios = self._memory["ratios"] + param_groups = {self.node_to_group[v] for v in self._memory["param_idxs"] + if v in self.node_to_group} + for _ in range(2 * max(1, len(param_groups))): + mem = self._current_memory() + if low - 1e-6 <= mem <= high + 1e-6: + return + over = mem > high + best = None + for gid in param_groups: + group = self.groups[gid] + cur_e = self._fast_group_energy(gid, group.current) + for ci in range(group.domain): + if ci == group.current: + continue + choice = group.choices[ci] + dmem = sum(ratios[m][choice[m]] - ratios[m][self.cur_out[m]] + for m in choice if m in ratios) + if (dmem < -1e-9) != over and abs(dmem) > 1e-9: + continue + if abs(dmem) <= 1e-9: + continue + score = (self._fast_group_energy(gid, ci) - cur_e) / abs(dmem) + if best is None or score < best[0]: + best = (score, gid, ci) + if best is None: + logger.warning("Approximate solver: memory repair stuck at %.4f " + 
"(budget=[%.4f,%.4f]).", mem, low, high) + return + self._set_group(best[1], best[2]) + + # ------------------------------------------------------------------ # + # Write-back + # ------------------------------------------------------------------ # + def total_objective(self): + """Exact objective of the current assignment via decision_vars (for + verification); equals pulp.value(prob.objective) after write-back.""" + total = 0.0 + for v in self.cost_bearing: + node = self.opt.nodes[v] + o = self.cur_out[v] + strat = self.opt.strats[node].strategies[o] + prod = self._arg_prod.get(v, {}) + n_args = len(strat.redistribute_cost) + c = 0.0 + for argi in range(n_args): + p = prod.get(argi) + inp = self.cur_out[p] if p is not None else 0 + key = (v, argi, o, inp) + if key in self.forbidden: + return INF + c += self.opt.decision_vars[key].cost + total += self.node_mult[v] * c + return total + + def _write_back(self): + opt = self.opt + for var in opt.pulp_variables.values(): + var.varValue = 0 + selected = [] + feasible = True + for v in self.cost_bearing: + node = opt.nodes[v] + o = self.cur_out[v] + strat = opt.strats[node].strategies[o] + prod = self._arg_prod.get(v, {}) + for argi in range(len(strat.redistribute_cost)): + p = prod.get(argi) + inp = self.cur_out[p] if p is not None else 0 + key = (v, argi, o, inp) + if key in self.forbidden: + feasible = False + opt.pulp_variables[key].varValue = 1 + selected.append(key) + opt.selected_keys = list(selected) + for rk in selected: + opt.selected_keys.extend(opt._root_to_linked.get(rk, [])) + opt.prob.status = pulp.LpStatusOptimal + opt.prob.sol_status = pulp.LpSolutionOptimal + # Populate prob.objective so callers can score the assignment with + # pulp.value(prob.objective); the returned value uses the equivalent but + # cheaper total_objective() rather than evaluating the full expression. 
+ opt._set_objective() + return INF if not feasible else self.total_objective() diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 06f2a4e6..9e0bf4f5 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -1231,7 +1231,15 @@ def apply_prefetch_discount(self, scale=0.0): # ---- Solution ---- def _set_objective(self): - """Add the cost minimization objective to the ILP.""" + """Add the cost minimization objective to the ILP. + + Idempotent: a no-op if the objective has already been set. This lets the + approximate solver populate ``prob.objective`` (so its assignment can be + scored with ``pulp.value(prob.objective)``) without clobbering or + double-adding it, and keeps repeated get_solution() calls safe. + """ + if self.prob.objective is not None: + return terms = [] for key, dv in self.decision_vars.items(): multiplier = 1 + len(self._root_to_linked.get(key, [])) diff --git a/examples/_bench_approx.py b/examples/_bench_approx.py new file mode 100644 index 00000000..272c47aa --- /dev/null +++ b/examples/_bench_approx.py @@ -0,0 +1,166 @@ +"""Benchmark approximate solver vs ILP: objective + solve time. + +Setting: LLaMA3 (1b default) on a 2D (dp, tp) mesh with vocab parallelism and +the canonical example_llama3 constraints. Both solvers run on the SAME built +optimizer: approx first (it only fills varValues/objective via an idempotent +_set_objective), then a fresh CBC solve for the ILP. This avoids building the +(expensive) strategy graph twice. + +Env knobs: MODEL_TYPE (1b|8b), MESH ("8,8"), N_LAYERS (0=default), SEQLEN, +REPEATED (1|0), RUN_ILP (1|0), ILP_TIMEOUT (seconds, 0=unlimited). 
+""" +import logging +import os +import time +from unittest.mock import patch + +import pulp +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +_alog = logging.getLogger("autoparallel.approximate_sharding") +_alog.setLevel(logging.INFO) +_alog.addHandler(logging.StreamHandler()) + + +def log(msg): + print(msg, flush=True) + + +_PATCHES = [ + patch("torch.cuda.device_count", lambda: 8), + patch("torch.cuda.get_device_name", lambda *a, **k: "H100"), + patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)), + patch( + "torch.cuda.get_device_properties", + lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132} + )(), + ), +] +for p in _PATCHES: + p.start() + +MODEL_TYPE = os.environ.get("MODEL_TYPE", "1b") +N_LAYERS = int(os.environ.get("N_LAYERS", "0")) +SEQLEN = int(os.environ.get("SEQLEN", str(2048 * 4))) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(",")) +REPEATED = os.environ.get("REPEATED", "1") == "1" +RUN_ILP = os.environ.get("RUN_ILP", "1") == "1" +LP_BOUND = os.environ.get("LP_BOUND", "1") == "1" +ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "1200")) + +world_size = 1 +for d in MESH_SHAPE: + world_size *= d + +_NAMES = {1: ("dp",), 2: ("dp", "tp"), 3: ("dp", "cp", "tp"), + 4: ("dp", "cp", "tp", "ep")} +mesh_names = _NAMES[len(MESH_SHAPE)] +fake_store = FakeStore() 
+torch.distributed.init_process_group("fake", store=fake_store, rank=0, world_size=world_size) +mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", MESH_SHAPE, mesh_dim_names=mesh_names +) + +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] +seqlen = SEQLEN + + +def model_fn(): + if MODEL_TYPE == "1b": + args = TransformerModelArgs( + dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, + ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000, + vocab_size=vocab_size, max_seq_len=seqlen, + ) + elif MODEL_TYPE == "8b": + args = TransformerModelArgs( + dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, + ffn_dim_multiplier=1.3, multiple_of=1024, rope_theta=500000, + vocab_size=vocab_size, max_seq_len=seqlen, + ) + else: + raise ValueError(MODEL_TYPE) + if N_LAYERS: + args.n_layers = N_LAYERS + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda") + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) + +log(f"model={MODEL_TYPE} layers={N_LAYERS or 'default'} mesh={MESH_SHAPE} " + f"world={world_size} seqlen={seqlen} repeated_subgraphs={REPEATED} " + f"ilp_timeout={ILP_TIMEOUT}") + +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=REPEATED) +autop.__enter__() +ndim = mesh.ndim +x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1) +# vocab-parallel output only defined for 2D (matches example_llama3); otherwise +# constrain the output like the input. 
+out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding +autop.add_parameter_memory_constraint(low=None, high=None) +autop.add_input_constraints([x_sharding]) +autop.add_output_constraints([out_sharding]) +opt = autop.sharding_optimizer +log(f"[build] optimizer ready in {time.perf_counter() - t:.2f}s " + f"vars={len(opt.pulp_variables)} constraints={len(opt.prob.constraints)} " + f"nodes={len(opt.nodes)}") + +# ---- APPROX ---- +t = time.perf_counter() +approx = ApproximateShardingSolver(opt) +approx.get_solution(verbose=True) +ap_t = time.perf_counter() - t +ap_obj = pulp.value(opt.prob.objective) +prof = opt.profile.get("approximate", {}) +log(f"\n[APPROX] objective={ap_obj:.2f} solve_time={ap_t:.3f}s") +log(f" groups={prof.get('groups')} sweeps={prof.get('sweeps')} " + f"build={prof.get('build_s'):.3f}s search={prof.get('solve_s'):.3f}s " + f"writeback={ap_t - prof.get('build_s', 0) - prof.get('solve_s', 0):.3f}s") + +# ---- LP relaxation lower bound (certified suboptimality upper bound) ---- +if LP_BOUND: + lb_res = opt.get_lower_bound(verbose=False) + lb = lb_res.objective + if lb and lb > 0: + cert = (ap_obj - lb) / lb + log(f"\n[LP-bound] lower_bound={lb:.2f} solve={lb_res.solve_s:.2f}s " + f"=> approx within {cert*100:.2f}% of optimum (certified upper bound)") + +# ---- ILP (fresh CBC solve on the same problem) ---- +if RUN_ILP: + opt._set_objective() # idempotent: objective already populated by approx + kw = {"msg": True} + if ILP_TIMEOUT > 0: + kw["timeLimit"] = ILP_TIMEOUT + log(f"\n[ILP] solving with CBC (timeLimit={ILP_TIMEOUT or 'none'})...") + t = time.perf_counter() + opt.prob.solve(pulp.PULP_CBC_CMD(**kw)) + ilp_t = time.perf_counter() - t + ilp_obj = pulp.value(opt.prob.objective) + status = pulp.LpStatus[opt.prob.status] + log(f"[ILP] objective={ilp_obj:.2f} solve_time={ilp_t:.3f}s status={status}") + + gap = (ap_obj - ilp_obj) / ilp_obj + log(f"\n=== objective gap = {gap*100:+.2f}% solve speedup = {ilp_t/ap_t:.1f}x ===") + log(f"=== 
within 20% ? {abs(gap) <= 0.20} (ILP status: {status}) ===") diff --git a/tests/test_approximate_sharding.py b/tests/test_approximate_sharding.py new file mode 100644 index 00000000..0bf06688 --- /dev/null +++ b/tests/test_approximate_sharding.py @@ -0,0 +1,140 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import pulp +import pytest +import torch +from conftest import apply_cuda_patches +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor._dtensor_spec import DTensorSpec +from torch.distributed.tensor.placement_types import Replicate, Shard + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver + + +def _fake_2d_mesh(): + return torch.distributed.device_mesh.init_device_mesh( + "cuda", (4, 2), mesh_dim_names=("dp", "tp") + ) + + +def _tiny_llama3_autop(mesh): + vocab_size = 128 + seq_len = 16 + batch_size = 2 * mesh.shape[0] + model_args = TransformerModelArgs( + dim=64, + n_layers=2, + n_heads=4, + n_kv_heads=2, + vocab_size=vocab_size, + multiple_of=32, + rope_theta=500000, + max_seq_len=seq_len, + ) + with torch.device("meta"): + model = Transformer(model_args) + + def input_fn(): + return torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") + + mp_policy = MixedPrecisionPolicy( + param_dtype=torch.bfloat16, reduce_dtype=torch.float32 + ) + return AutoParallel(model, input_fn, mesh, mp_policy, repeated_subgraphs=True) + + +def _add_constraints(autop, mesh): + autop.add_parameter_memory_constraint(low=None, high=None) + autop.add_input_constraints([(Shard(0),) + (Replicate(),) * (mesh.ndim - 1)]) + autop.add_output_constraints([(Shard(0), Shard(2))]) + + +@apply_cuda_patches 
+@pytest.mark.filterwarnings("ignore:Constructing LpVariable") +@pytest.mark.filterwarnings("ignore:Overwriting previously set objective") +def test_approx_objective_close_to_ilp(): + """The approximate solver should be much faster than the ILP while staying + within a small objective gap on a tiny LLaMA3 block + 2D mesh.""" + mesh = _fake_2d_mesh() + with _tiny_llama3_autop(mesh) as autop: + _add_constraints(autop, mesh) + opt = autop.sharding_optimizer + + autop.optimize_placement(verbose=False, solver="approx") + approx_objective = pulp.value(opt.prob.objective) + # The approx assignment must be ILP-feasible (flow consistency etc.); + # an infeasible assignment can score artificially low and silently pass + # the objective bound below. + violated = [n for n, c in opt.prob.constraints.items() if not c.valid()] + assert not violated, f"approx violated {len(violated)} constraints" + + autop.optimize_placement(verbose=False, solver="ilp") + ilp_objective = pulp.value(opt.prob.objective) + + assert math.isfinite(approx_objective) + assert ilp_objective > 0 + assert approx_objective >= ilp_objective - 1e-6 # ILP is optimal + assert approx_objective <= ilp_objective * 1.20 + 1e-6, ( + f"approx={approx_objective} ilp={ilp_objective} " + f"gap={(approx_objective / ilp_objective - 1) * 100:.1f}%" + ) + + +@apply_cuda_patches +@pytest.mark.filterwarnings("ignore:Constructing LpVariable") +def test_approx_objective_is_faithful(): + """The solver's internal energy must equal the exact ILP objective evaluated + on its assignment (pulp.value), so comparisons against the ILP are valid.""" + mesh = _fake_2d_mesh() + with _tiny_llama3_autop(mesh) as autop: + _add_constraints(autop, mesh) + opt = autop.sharding_optimizer + + solver = ApproximateShardingSolver(opt) + solver.get_solution(verbose=False) + + pulp_objective = pulp.value(opt.prob.objective) + internal_energy = solver.total_objective() + assert math.isfinite(internal_energy) + assert internal_energy == 
pytest.approx(pulp_objective, rel=1e-6) + # No forbidden decision variable should be selected. + assert all(key not in solver.forbidden for key in opt.selected_keys) + # And every ILP constraint must hold (flow consistency, paired, memory). + violated = [n for n, c in opt.prob.constraints.items() if not c.valid()] + assert not violated, f"approx violated {len(violated)} constraints" + + +@apply_cuda_patches +@pytest.mark.filterwarnings("ignore:Constructing LpVariable") +def test_approx_respects_input_output_constraints(): + """User input/output placement constraints must be honored by the solution.""" + mesh = _fake_2d_mesh() + x_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1) + out_sharding = (Shard(0), Shard(2)) + with _tiny_llama3_autop(mesh) as autop: + autop.add_parameter_memory_constraint(low=None, high=None) + autop.add_input_constraints([x_sharding]) + autop.add_output_constraints([out_sharding]) + + solution = autop.optimize_placement(verbose=False, solver="approx") + assert solution + + placements = { + spec.placements + for strat in solution.values() + for spec in ( + strat.output_specs + if isinstance(strat.output_specs, (list, tuple)) + else (strat.output_specs,) + ) + if isinstance(spec, DTensorSpec) + } + assert x_sharding in placements + assert out_sharding in placements From d06957fbec8972a651f29bbc5fe8df75856e77bd Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sat, 30 May 2026 16:49:23 -0700 Subject: [PATCH 06/27] Speed up optimizer build by skipping PuLP for the approximate solver The optimizer build (strategy enumeration, decision vars, PuLP variables and constraints) dominates end-to-end time, especially on 3D meshes where it constructs ~14M PuLP variables and ~6M constraints that the approximate solver never needs. 
Two result-preserving changes cut build time: - Hoist the per-node _all_input_nodes / producer-strategy lookups out of the inner decision-var loops (they were recomputed once per decision variable, ~14M times on 3D); this also speeds up the ILP build. - Add ShardingOptimizer(build_pulp=False), selected via AutoParallel(solver="approx"), which skips PuLP variable and constraint construction entirely. The approximate solver then derives the constraint topology directly from the graph + cluster links + constraint log (_topology_direct), verified byte-identical to parsing the PuLP constraints. On LLaMA3 1B the build drops ~2.1x (2D) and ~3.3x (3D, ~13min -> ~4min) with byte-identical placements; 3D end-to-end goes ~17min -> ~5min. test_lite_build_matches_full guards the equivalence. Authored with Claude. --- autoparallel/api.py | 33 ++++- autoparallel/approximate_sharding.py | 185 +++++++++++++++++++++++++-- autoparallel/optimize_sharding.py | 60 ++++++--- tests/test_approximate_sharding.py | 35 ++++- 4 files changed, 274 insertions(+), 39 deletions(-) diff --git a/autoparallel/api.py b/autoparallel/api.py index 907d6111..e5356d1d 100644 --- a/autoparallel/api.py +++ b/autoparallel/api.py @@ -203,8 +203,15 @@ def __init__( dynamic: bool = False, cost_model: Any = "nccl", repeated_subgraphs: bool = True, + solver: str = "ilp", ): self.stack = ExitStack() + # "approx" builds a lighter optimizer (no PuLP variables/constraints), + # which is much faster to construct; optimize_placement(solver="approx") + # then solves it heuristically. "ilp" builds the full PuLP problem. 
+ if solver not in ("ilp", "approx"): + raise ValueError(f"Unknown solver={solver!r}; expected 'ilp' or 'approx'") + self.solver = solver self.fake_mode = ( FakeTensorMode() ) # TODO: maybe need to reuse the model's fake mode @@ -281,6 +288,7 @@ def __enter__(self): self.mesh, force_grad_reduce_in_higher_precision, repeated_subgraphs=self.repeated_subgraphs, + build_pulp=self.solver != "approx", ) self.sharding_optimizer = sharding_optimizer @@ -356,15 +364,19 @@ def add_output_constraints(self, constraints): self.sharding_optimizer.add_sharded_output_constraint(constraints) self.output_constraints = constraints - def optimize_placement(self, verbose=True, solver="ilp", approximate_options=None): + def optimize_placement(self, verbose=True, solver=None, approximate_options=None): """Solve for the optimal placement. - solver="ilp" (default) uses the exact PuLP/CBC solver. solver="approx" - uses the heuristic ApproximateShardingSolver, which trades a small - objective gap for a much faster solve. approximate_options is forwarded - as kwargs to the approximate solver (e.g. candidate_limit, max_sweeps). + solver="ilp" uses the exact PuLP/CBC solver. solver="approx" uses the + heuristic ApproximateShardingSolver, which trades a small objective gap + for a much faster solve. approximate_options is forwarded as kwargs to + the approximate solver (e.g. candidate_limit, max_sweeps). Defaults to the + solver chosen at AutoParallel construction; note an optimizer built with + solver="approx" has no PuLP problem and cannot run the ILP. 
""" self._assert_entered() + if solver is None: + solver = self.solver if solver in ("approx", "approximate"): from .approximate_sharding import ApproximateShardingSolver @@ -374,6 +386,12 @@ def optimize_placement(self, verbose=True, solver="ilp", approximate_options=Non ) self.sharding_placement = approx.get_solution(verbose=verbose) elif solver == "ilp": + if self.sharding_optimizer.prob is None: + raise RuntimeError( + "solver='ilp' requires a PuLP problem, but this AutoParallel " + "was constructed with solver='approx' (no PuLP built). " + "Construct with solver='ilp' to use the exact solver." + ) self.sharding_placement = self.sharding_optimizer.get_solution( verbose=False ) @@ -394,7 +412,10 @@ def optimize_placement(self, verbose=True, solver="ilp", approximate_options=Non ), ) - if self.sharding_optimizer.prob.status == -1: + if ( + self.sharding_optimizer.prob is not None + and self.sharding_optimizer.prob.status == -1 + ): raise RuntimeError( "The sharding optimizer could not find a feasible solution. " "This typically means the user-specified constraints are " diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py index aba6111c..7e1a945a 100644 --- a/autoparallel/approximate_sharding.py +++ b/autoparallel/approximate_sharding.py @@ -43,6 +43,9 @@ import numpy as np import pulp +import torch +from torch.distributed.tensor._dtensor_spec import DTensorSpec +from torch.distributed.tensor.placement_types import Replicate, Shard from .cost_models.compute_estimation import _get_sharded_shape_stride @@ -293,7 +296,11 @@ def _build_problem(self): self.allowed_out[opt.node_map[node]] = list(range(len(strat.strategies))) t = time.perf_counter() - paired_edges, authoritative = self._parse_constraints() + if opt.prob is None: + # Lite build: no PuLP problem was constructed, derive topology directly. 
+ paired_edges, authoritative = self._topology_direct() + else: + paired_edges, authoritative = self._parse_constraints() # Flow edges are taken from the ILP's output_input_consistent constraints # (the authoritative producer per consumer-arg), NOT from _all_input_nodes: # the two disagree for some ops (einsum list-args, alias/backward nodes), @@ -425,6 +432,159 @@ def _parse_constraints(self): self.allowed_out[n] = [o for o in self.allowed_out[n] if o in out_set] return paired_edges, authoritative + def _topology_direct(self): + """Compute the same topology (forbidden / out_idx restrictions / paired + edges / flow producers) that _parse_constraints extracts, but directly + from the graph + cluster_links + _constraint_log, WITHOUT a PuLP problem. + This lets the optimizer skip building millions of PuLP variables and + constraints when only the approximate solver is used. + + Mirrors ShardingOptimizer.add_inf_cost_constraint / + add_grad_reduce_dtype_constraints / add_forward_backward_consistency_constraints / + _add_paired_output_constraint / add_node_constraint / + add_output_input_consistent_constraint. Verified byte-identical to + _parse_constraints on a full build (see tests).""" + from torch._functorch._aot_autograd.fx_utils import ( + get_param_and_grad_nodes, + get_plain_input_and_grad_nodes, + get_plain_output_and_tangent_nodes, + ) + + opt = self.opt + cl = opt.cluster_links + + def rootkey(k): + return cl.get(k, k) + + cluster_linked = {key[0] for key in cl} + node_root = {} + for lk, rk in cl.items(): + node_root[lk[0]] = rk[0] + + def nroot(idx): + return node_root.get(idx, idx) + + # 1. inf-cost forbidden (== add_inf_cost_constraint). + for key, dv in opt.decision_vars.items(): + if not math.isfinite(dv.cost) or dv.cost == 10000.0: + self.forbidden.add(key) + + # 2. grad-reduce-dtype forbidden (== add_grad_reduce_dtype_constraints). 
+ if getattr(opt, "force_grad_reduce_in_higher_precision", False): + cast_op = torch.ops.autoparallel.dtype_cast.default + pre_cast: set[int] = set() + for param, grad in get_param_and_grad_nodes(opt.graph).values(): + if grad is None: + continue + chain = [grad] + n = grad + while len(n.all_input_nodes) == 1: + parent = n.all_input_nodes[0] + if len(parent.all_input_nodes) != 1: + break + chain.append(parent) + n = parent + cast_idx = next( + (i for i, nd in enumerate(chain) if nd.target == cast_op), None + ) + if cast_idx is None: + continue + for nd in chain[cast_idx:]: + if nd in opt.node_map: + pre_cast.add(opt.node_map[nd]) + for key, dv in opt.decision_vars.items(): + if key[0] in pre_cast and dv.comm_cost > 0: + self.forbidden.add(key) + + # 3. forward/backward paired output constraints + disables + # (== add_forward_backward_consistency_constraints / _add_paired_output_constraint). + paired_edges: list[tuple[int, int, frozenset]] = [] + + def add_paired(node_a, node_b): + idx_a, idx_b = opt.node_map[node_a], opt.node_map[node_b] + strat_a = [str(s.output_specs) for s in opt.strats[node_a].strategies] + strat_b = [str(s.output_specs) for s in opt.strats[node_b].strategies] + num_inp_a = len(opt.strats[node_a].strategies[0].redistribute_cost[0]) + for out_idx, sp in enumerate(strat_a): + if sp not in strat_b: + for inp in range(num_inp_a): + self.forbidden.add(rootkey((idx_a, 0, out_idx, inp))) + continue + out_idx_b = strat_b.index(sp) + ra = rootkey((idx_a, 0, out_idx, 0))[0] + rb = rootkey((idx_b, 0, out_idx_b, 0))[0] + paired_edges.append((ra, rb, frozenset({(out_idx, out_idx_b)}))) + + for param, grad in get_param_and_grad_nodes(opt.graph).values(): + if grad is not None: + add_paired(param, grad) + for node, gnode in get_plain_input_and_grad_nodes(opt.graph).values(): + if gnode is not None: + add_paired(node, gnode) + for node, tnode in get_plain_output_and_tangent_nodes(opt.graph).values(): + if tnode is not None: + add_paired(node, tnode) + + # 4. 
user node/input/output placement restrictions (== add_node_constraint), + # replayed from _constraint_log. + restrict: dict[int, set] = {} + for fname, kwargs in getattr(opt, "_constraint_log", []): + if fname != "add_node_constraint": + continue + node = next( + (nd for nd in opt.nodes if nd.name == kwargs["node_name"]), None + ) + if node is None or node not in opt.strats: + continue + placement = kwargs["placement"] + if placement is None: + placement = (Shard(0),) + (Replicate(),) * (opt.mesh.ndim - 1) + out_set = set() + for i, s in enumerate(opt.strats[node].strategies): + specs = s.output_specs + if isinstance(specs, DTensorSpec): + if specs.placements == placement: + out_set.add(i) + elif isinstance(specs, (list, tuple)): + for spec in specs: + if isinstance(spec, DTensorSpec): + if spec.placements == placement: + out_set.add(i) + break + r = nroot(opt.node_map[node]) + restrict[r] = restrict.get(r, out_set) & out_set + for n_idx, out_set in restrict.items(): + if n_idx in self.allowed_out: + self.allowed_out[n_idx] = [ + o for o in self.allowed_out[n_idx] if o in out_set + ] + + # 5. flow producers (== add_output_input_consistent_constraint): for each + # consumer-arg, the set of (cluster-resolved) producers feeding it. 
+ authoritative: dict[tuple[int, int], set] = {} + for node in opt.graph.nodes: + if node.op == "output" or node not in opt.node_map: + continue + p_idx = opt.node_map[node] + p_linked = p_idx in cluster_linked + p_root = nroot(p_idx) + for user in node.users: + if user.op == "output" or user not in opt.node_map: + continue + u_idx = opt.node_map[user] + if p_linked and u_idx in cluster_linked: + continue + ain = opt._all_input_nodes(user) + argi = next((i for i, x in enumerate(ain) if x is node), None) + if argi is None: + continue + ispecs = opt.strats[user].strategies[0].input_specs + if argi < len(ispecs) and ispecs[argi] is None: + continue + authoritative.setdefault((nroot(u_idx), argi), set()).add(p_root) + + return paired_edges, authoritative + def _out_fully_forbidden(self, v, node, o): strat = self.opt.strats[node].strategies[o] for argi, costs in enumerate(strat.redistribute_cost): @@ -1029,8 +1189,10 @@ def total_objective(self): def _write_back(self): opt = self.opt - for var in opt.pulp_variables.values(): - var.varValue = 0 + has_pulp = bool(opt.pulp_variables) + if has_pulp: + for var in opt.pulp_variables.values(): + var.varValue = 0 selected = [] feasible = True for v in self.cost_bearing: @@ -1044,15 +1206,18 @@ def _write_back(self): key = (v, argi, o, inp) if key in self.forbidden: feasible = False - opt.pulp_variables[key].varValue = 1 + if has_pulp: + opt.pulp_variables[key].varValue = 1 selected.append(key) opt.selected_keys = list(selected) for rk in selected: opt.selected_keys.extend(opt._root_to_linked.get(rk, [])) - opt.prob.status = pulp.LpStatusOptimal - opt.prob.sol_status = pulp.LpSolutionOptimal - # Populate prob.objective so callers can score the assignment with - # pulp.value(prob.objective); the returned value uses the equivalent but - # cheaper total_objective() rather than evaluating the full expression. 
- opt._set_objective() + # Populate prob.objective (when a PuLP problem exists) so callers can also + # score via pulp.value(prob.objective); the returned value uses the + # equivalent but cheaper total_objective(). In the lite (no-PuLP) build, + # there is no problem to populate. + if opt.prob is not None: + opt.prob.status = pulp.LpStatusOptimal + opt.prob.sol_status = pulp.LpSolutionOptimal + opt._set_objective() return INF if not feasible else self.total_objective() diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 9e0bf4f5..f66fd4c7 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -281,6 +281,7 @@ def __init__( force_grad_reduce_in_higher_precision=False, repeated_subgraphs=False, solver_backend="ilp", + build_pulp=True, ): self.orig_gm = gm if solver_backend not in {"ilp", "dp"}: @@ -289,6 +290,13 @@ def __init__( "expected 'ilp' or 'dp'" ) self.solver_backend = solver_backend + # When False, skip creating PuLP variables and constraints entirely. + # decision_var costs + strategies + cluster_links are still built, which + # is all the approximate solver needs (it derives the constraint topology + # directly). This avoids constructing millions of PuLP objects on large / + # 3D meshes, where that dominates build time. + self.build_pulp = build_pulp + self.prob = None # The optimizer works on a concretized copy of the graph where all # symbolic shapes are replaced with their concrete hint values. 
This # centralizes dynamic-shape handling: the optimization pipeline @@ -401,8 +409,9 @@ def __init__( ) self.validate() t2 = time.perf_counter() - self.prob = pulp.LpProblem("AutoParallel", pulp.LpMinimize) - self.add_default_constraints() + if self.build_pulp: + self.prob = pulp.LpProblem("AutoParallel", pulp.LpMinimize) + self.add_default_constraints() t3 = time.perf_counter() decision_var_build_s = t1 - t0 cost_estimation_s = self._decision_var_profile["cost_estimation_s"] @@ -427,7 +436,7 @@ def __init__( } ) n_unique_vars = len(self.pulp_variables) - n_constraints = len(self.prob.constraints) + n_constraints = len(self.prob.constraints) if self.prob is not None else 0 self.profile["ilp"] = { "unique_variables": n_unique_vars, "logical_decision_variables": self._decision_var_profile[ @@ -819,7 +828,7 @@ def _build_decision_vars(self): """Build DecisionVar entries for every (node_idx, argi, out_idx, inp_idx) combination in the strategy space.""" t_pulp_start = time.perf_counter() - self.pulp_variables = self._create_pulp_variables() + self.pulp_variables = self._create_pulp_variables() if self.build_pulp else {} t_pulp_end = time.perf_counter() # Precompute which node indices are cluster-linked so we can @@ -845,24 +854,30 @@ def _build_decision_vars(self): num_args = len(op_strategy.strategies[0].input_specs) + # Hoisted out of the per-(out_idx, argi, inp_idx) loops: these depend + # only on the node, not on the strategy choice. Recomputing them per + # decision var was O(#vars) calls to _all_input_nodes (a tree_flatten + # each), which dominated build time on large/3D meshes. 
+ all_input_nodes = self._all_input_nodes(node) + producer_strategies = [self.strats[n] for n in all_input_nodes] + pulp_variables = self.pulp_variables + for out_idx, output_strategy in enumerate(op_strategy.strategies): tc0 = time.perf_counter() compute_cost = estimate_strategy_runtime_cost(node, output_strategy) - tc1 = time.perf_counter() - t_compute += tc1 - tc0 + t_compute += time.perf_counter() - tc0 per_arg_compute = compute_cost / num_args + te0 = time.perf_counter() for argi, redist_costs in enumerate(output_strategy.redistribute_cost): + producer_strategy = ( + producer_strategies[argi] + if argi < len(producer_strategies) + else None + ) + input_spec = output_strategy.input_specs[argi] for inp_idx, default_comm_cost in enumerate(redist_costs): key = (node_idx, argi, out_idx, inp_idx) - - all_input_nodes = self._all_input_nodes(node) - producer_strategy = ( - self.strats[all_input_nodes[argi]] - if all_input_nodes - else None - ) - te0 = time.perf_counter() comm_cost, transition_cost = self._compute_edge_costs( node, output_strategy, @@ -871,22 +886,19 @@ def _build_decision_vars(self): default_comm_cost, producer_strategy, ) - te1 = time.perf_counter() - t_edge += te1 - te0 - redist_costs[inp_idx] = comm_cost - decision_vars[key] = DecisionVar( - var=self.pulp_variables[key], + var=pulp_variables[key] if pulp_variables else None, cost=comm_cost + per_arg_compute + transition_cost, compute_cost=per_arg_compute, comm_cost=comm_cost, sharding_transition_cost=transition_cost, strategy=output_strategy, output_spec=output_strategy.output_specs, - input_spec=output_strategy.input_specs[argi], + input_spec=input_spec, ) n_vars += 1 + t_edge += time.perf_counter() - te0 # Batch-copy redistribute_cost from root strats to linked strats. 
# The root pass above updated redistribute_cost in place with @@ -951,7 +963,7 @@ def _resolve_decision_var(self, key): node_idx, argi, out_idx, _ = key strategy = self.strats[self.nodes[node_idx]].strategies[out_idx] return DecisionVar( - var=self._get_pulp_variable(key), + var=self._get_pulp_variable(key) if self.pulp_variables else None, cost=root_dv.cost, compute_cost=root_dv.compute_cost, comm_cost=root_dv.comm_cost, @@ -1644,6 +1656,8 @@ def _compute_solution_cost(self, solution): # ---- Logging ---- def get_violated_constraints_log(self): + if self.prob is None: + return "Violated constraints: [] (no PuLP problem; lite build)" violated_constraints = [ (k, c) for k, c in self.prob.constraints.items() if not c.valid() ] @@ -2097,6 +2111,8 @@ def add_parameter_memory_constraint( }, ) ) + if self.prob is None: + return # approx solver reads the factors from _constraint_log param_nodes: list[torch.fx.Node] = get_param_nodes(self.graph) elms: list[pulp.LpAffineExpression] = [] budget_low: float = 0.0 @@ -2161,6 +2177,8 @@ def add_node_constraint(self, node, placement=None, constraint_name=None): raise RuntimeError( f"Couldn't find appropriate constraint {node} {constraint_name} {placement}" ) + if self.prob is None: + return [] # approx solver replays this from _constraint_log return self._add_node_constraint( node, output_constraint_indices=output_constraint_indices, diff --git a/tests/test_approximate_sharding.py b/tests/test_approximate_sharding.py index 0bf06688..0383fad8 100644 --- a/tests/test_approximate_sharding.py +++ b/tests/test_approximate_sharding.py @@ -24,7 +24,7 @@ def _fake_2d_mesh(): ) -def _tiny_llama3_autop(mesh): +def _tiny_llama3_autop(mesh, solver="ilp"): vocab_size = 128 seq_len = 16 batch_size = 2 * mesh.shape[0] @@ -47,7 +47,9 @@ def input_fn(): mp_policy = MixedPrecisionPolicy( param_dtype=torch.bfloat16, reduce_dtype=torch.float32 ) - return AutoParallel(model, input_fn, mesh, mp_policy, repeated_subgraphs=True) + return AutoParallel( + 
model, input_fn, mesh, mp_policy, repeated_subgraphs=True, solver=solver + ) def _add_constraints(autop, mesh): @@ -138,3 +140,32 @@ def test_approx_respects_input_output_constraints(): } assert x_sharding in placements assert out_sharding in placements + + +@apply_cuda_patches +@pytest.mark.filterwarnings("ignore:Constructing LpVariable") +def test_lite_build_matches_full(): + """Building with solver="approx" skips PuLP variables/constraints (faster + setup); the resulting assignment must be byte-identical to running the + approximate solver on a full PuLP build.""" + mesh = _fake_2d_mesh() + + with _tiny_llama3_autop(mesh, solver="ilp") as autop: + _add_constraints(autop, mesh) + assert autop.sharding_optimizer.prob is not None + autop.optimize_placement(verbose=False, solver="approx") + obj_full = autop.sharding_optimizer.profile["approximate"]["objective"] + keys_full = set(autop.sharding_optimizer.selected_keys) + + with _tiny_llama3_autop(mesh, solver="approx") as autop: + _add_constraints(autop, mesh) + # Lite build: no PuLP problem or variables were constructed. + assert autop.sharding_optimizer.prob is None + assert not autop.sharding_optimizer.pulp_variables + solution = autop.optimize_placement(verbose=False) + obj_lite = autop.sharding_optimizer.profile["approximate"]["objective"] + keys_lite = set(autop.sharding_optimizer.selected_keys) + assert solution + + assert obj_lite == pytest.approx(obj_full, rel=1e-9) + assert keys_lite == keys_full From 17fdb4e117e4b1291696c2762174969eabcec02c Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sat, 30 May 2026 16:51:42 -0700 Subject: [PATCH 07/27] Prune invalid sharding strategies and skip CBC integer preprocessing The sharding ILP's LP relaxation is naturally integral, so CBC reaches the optimum at the root with zero branch-and-bound. 
The solve time was dominated by CBC's integer preprocessing churning through hundreds of thousands of binary columns, ~30% of which are invalid (infinite-cost) strategy edges the optimizer materialized only to immediately constrain to zero. To review, start with optimize_sharding.py: _build_decision_vars now computes each edge's cost up front and only creates a variable when it is finite, recording the survivors in _valid_keys. The constraint builders and _create_pulp_variables tolerate the pruned keys (a missing key is an empty, i.e. zero, term), the same-output and flow constraints key explicitly by output index instead of relying on positional alignment, and add_inf_cost_constraint becomes a no-op for fresh builds. _solve then passes "preprocess off" to CBC. serialization.py seeds _valid_keys on load so saved optimizers match freshly built ones, and test_optimize_placement.py adds a regression test for the invariant. On LLaMA3-1B with a 2D mesh this drops the problem from 476176 to 335390 variables and 173442 to 29643 constraints, and the solve from ~66s to ~11s, with the objective unchanged (48449.3483). Authored with Claude. --- autoparallel/optimize_sharding.py | 190 ++++++++++++++++++++++-------- autoparallel/serialization.py | 26 ++-- tests/test_optimize_placement.py | 35 ++++++ 3 files changed, 192 insertions(+), 59 deletions(-) diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 6b72878b..76d9b3c4 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -246,6 +246,10 @@ def __init__( # remove_constraints can keep this in sync. self._node_constraint_names: dict[str, str] = {} self._name_counters: dict[str, int] = {} + # Set by _build_decision_vars: the (node, arg, out, inp) keys whose + # strategy edge has finite cost. Invalid (infinite-cost) edges are + # pruned and get no variable. None means "no pruning filter". 
+ self._valid_keys: set[tuple] | None = None t0 = time.perf_counter() self.strats = self.build_sharding_metadata() # nodes/node_map are derived from strats (not graph.nodes) so that @@ -416,6 +420,12 @@ def _create_pulp_variables(self): Returns a dict mapping root (node_idx, argi, out_idx, inp_idx) keys to their PuLP variables. Linked keys are not stored; use _get_pulp_variable() to resolve them through cluster_links. + + Keys whose strategy is invalid (infinite cost) are pruned: if + self._valid_keys is set, only those keys get a variable. These + variables would otherwise be forced to zero by an inf-cost + constraint, so skipping them shrinks the ILP without changing the + optimum (see _build_decision_vars). """ cluster_linked_node_idxs = {key[0] for key in self.cluster_links} @@ -428,6 +438,8 @@ def _create_pulp_variables(self): continue for argi, out_idx, inp_idx in self.walk_over_options(node): key = (node_idx, argi, out_idx, inp_idx) + if self._valid_keys is not None and key not in self._valid_keys: + continue root_node = self.nodes[node_idx] pulp_variables[key] = pulp.LpVariable( f"n={root_node},s={node_idx},arg={argi}," @@ -439,9 +451,12 @@ def _create_pulp_variables(self): def _get_pulp_variable(self, key): """Look up the PuLP variable for a key, resolving through - cluster_links if the key belongs to a linked node.""" + cluster_links if the key belongs to a linked node. + + Returns None if the key was pruned (invalid/infinite-cost strategy). + """ root_key = self.cluster_links.get(key, key) - return self.pulp_variables[root_key] + return self.pulp_variables.get(root_key) def _compute_edge_costs( self, @@ -480,26 +495,33 @@ def _compute_edge_costs( def _build_decision_vars(self): """Build DecisionVar entries for every (node_idx, argi, out_idx, inp_idx) - combination in the strategy space.""" - t_pulp_start = time.perf_counter() - self.pulp_variables = self._create_pulp_variables() - t_pulp_end = time.perf_counter() + combination in the strategy space. 
+ Strategy edges whose total cost is infinite (invalid redistributions) + are pruned: no variable is created for them. Such a variable would be + forced to zero by an inf-cost constraint anyway, so dropping it leaves + the optimum unchanged while removing ~30% of the variables and the + corresponding ~80% of constraints that are pure ``var == 0`` bounds. + """ # Precompute which node indices are cluster-linked so we can # copy costs from the root instead of recomputing them. self._cluster_linked_node_idxs = {key[0] for key in self.cluster_links} t_compute = 0.0 t_edge = 0.0 - n_vars = 0 + n_pruned = 0 n_cluster_copied = 0 + t_pulp_start = time.perf_counter() + self.pulp_variables = {} + self._valid_keys: set[tuple] = set() decision_vars = {} strats_items = [ (self.node_map[node], node, strat) for node, strat in self.strats.items() ] - # Build DVs for root nodes only (not cluster-linked). + # Build DVs for root nodes only (not cluster-linked). Compute the edge + # cost first and only materialize a variable when it is finite. 
for node_idx, node, op_strategy in strats_items: if node.op == "output": continue @@ -507,6 +529,7 @@ def _build_decision_vars(self): continue num_args = len(op_strategy.strategies[0].input_specs) + all_input_nodes = self._all_input_nodes(node) for out_idx, output_strategy in enumerate(op_strategy.strategies): tc0 = time.perf_counter() @@ -516,15 +539,10 @@ def _build_decision_vars(self): per_arg_compute = compute_cost / num_args for argi, redist_costs in enumerate(output_strategy.redistribute_cost): + producer_strategy = ( + self.strats[all_input_nodes[argi]] if all_input_nodes else None + ) for inp_idx, default_comm_cost in enumerate(redist_costs): - key = (node_idx, argi, out_idx, inp_idx) - - all_input_nodes = self._all_input_nodes(node) - producer_strategy = ( - self.strats[all_input_nodes[argi]] - if all_input_nodes - else None - ) te0 = time.perf_counter() comm_cost, transition_cost = self._compute_edge_costs( node, @@ -539,9 +557,22 @@ def _build_decision_vars(self): redist_costs[inp_idx] = comm_cost + cost = comm_cost + per_arg_compute + transition_cost + if not math.isfinite(cost): + n_pruned += 1 + continue + + key = (node_idx, argi, out_idx, inp_idx) + var = pulp.LpVariable( + f"n={node},s={node_idx},arg={argi}," + f"output_p={out_idx},input_p={inp_idx}", + cat=pulp.LpBinary, + ) + self.pulp_variables[key] = var + self._valid_keys.add(key) decision_vars[key] = DecisionVar( - var=self.pulp_variables[key], - cost=comm_cost + per_arg_compute + transition_cost, + var=var, + cost=cost, compute_cost=per_arg_compute, comm_cost=comm_cost, sharding_transition_cost=transition_cost, @@ -549,7 +580,6 @@ def _build_decision_vars(self): output_spec=output_strategy.output_specs, input_spec=output_strategy.input_specs[argi], ) - n_vars += 1 # Batch-copy redistribute_cost from root strats to linked strats. 
# The root pass above updated redistribute_cost in place with @@ -570,16 +600,20 @@ def _build_decision_vars(self): list(costs) for costs in root_spec.redistribute_cost ] n_cluster_copied = len(self.cluster_links) - n_vars += n_cluster_copied + # Linked keys mirror their root's validity (redistribute_cost is copied + # from the root above), so only valid root keys map to linked keys. self._root_to_linked: dict[tuple, list[tuple]] = defaultdict(list) for linked_key, root_key in self.cluster_links.items(): - self._root_to_linked[root_key].append(linked_key) + if root_key in self._valid_keys: + self._root_to_linked[root_key].append(linked_key) + t_pulp_end = time.perf_counter() logger.debug( - "_build_decision_vars breakdown (%d vars, %d cluster-copied): " - "pulp_vars=%.3fs, compute_cost=%.3fs, edge_cost=%.3fs", - n_vars, + "_build_decision_vars breakdown (%d vars, %d pruned-inf, %d cluster-copied): " + "build=%.3fs, compute_cost=%.3fs, edge_cost=%.3fs", + len(decision_vars), + n_pruned, n_cluster_copied, t_pulp_end - t_pulp_start, t_compute, @@ -607,6 +641,24 @@ def _resolve_decision_var(self, key): input_spec=strategy.input_specs[argi], ) + def _find_decision_var(self, node_idx, argi, out_idx): + """Return a DecisionVar for any surviving inp_idx of (node, arg, out), + or None if every edge for that output strategy was pruned. + + compute_cost is identical across inp_idx for a given out_idx, so callers + that only need per-strategy costs can use whichever edge survived. 
+ """ + strategy = self.strats[self.nodes[node_idx]].strategies[out_idx] + n_inp = len(strategy.redistribute_cost[argi]) if strategy.redistribute_cost else 1 + for inp_idx in range(n_inp): + key = (node_idx, argi, out_idx, inp_idx) + if key in self.decision_vars: + return self._resolve_decision_var(key) + root_key = self.cluster_links.get(key) + if root_key is not None and root_key in self.decision_vars: + return self._resolve_decision_var(key) + return None + def _collect_vars(self, node, node_idx, argi, group_by, resolve_clusters=False): """Collect PuLP variables for a node's options, grouped by strategy index. @@ -622,9 +674,11 @@ def _collect_vars(self, node, node_idx, argi, group_by, resolve_clusters=False): if key in self.cluster_links: if not resolve_clusters: continue - var = self.pulp_variables[self.cluster_links[key]] + var = self.pulp_variables.get(self.cluster_links[key]) else: - var = self.pulp_variables[key] + var = self.pulp_variables.get(key) + if var is None: # pruned (invalid/infinite-cost) strategy edge + continue group_key = out_idx if group_by == "out_idx" else inp_idx result.setdefault(group_key, []).append(var) return result @@ -679,7 +733,9 @@ def add_unique_decision_constraint(self): arg_vars = {} for argi, out_idx, inp_idx in self.walk_over_options(node): key = (node_idx, argi, out_idx, inp_idx) - var = self.pulp_variables[key] + var = self.pulp_variables.get(key) + if var is None: # pruned (invalid) strategy edge + continue arg_vars.setdefault(argi, []).append(var) for eqs in arg_vars.values(): self.prob += ( @@ -703,20 +759,24 @@ def add_same_output_across_args_constraint(self): continue if len(self._all_input_nodes(node)) <= 1: continue - vars_per_output = {} + # Group vars by (argi, out_idx). Pruning can leave an arg with no + # vars for a given out_idx, so we key explicitly by out_idx rather + # than relying on positional alignment: a missing entry means an + # empty sum (== 0), which correctly forbids that output strategy. 
+ num_args = len(self._all_input_nodes(node)) + vars_per_output: dict[tuple[int, int], list] = {} for argi, out_idx, inp_idx in self.walk_over_options(node): key = (node_idx, argi, out_idx, inp_idx) - var = self.pulp_variables[key] + var = self.pulp_variables.get(key) + if var is None: # pruned (invalid) strategy edge + continue vars_per_output.setdefault((argi, out_idx), []).append(var) - eqs_per_arg = [[] for _ in self._all_input_nodes(node)] - for (argi, out_idx), value in vars_per_output.items(): - eqs_per_arg[argi].append(pulp.lpSum(value)) - arg0 = eqs_per_arg[0] - for arg_eqs in eqs_per_arg[1:]: - assert len(arg0) == len(arg_eqs) - for i in range(len(arg0)): + all_out_idxs = {oi for (_, oi) in vars_per_output} + for out_idx in all_out_idxs: + arg0_eq = pulp.lpSum(vars_per_output.get((0, out_idx), [])) + for argi in range(1, num_args): self.prob += ( - arg0[i] == arg_eqs[i], + arg0_eq == pulp.lpSum(vars_per_output.get((argi, out_idx), [])), self._get_next_name("same_across_args"), ) @@ -790,13 +850,15 @@ def add_output_input_consistent_constraint(self): ) continue - assert ( - vars_producer.keys() == vars_consumer.keys() - ), f"{vars_producer}, {vars_consumer}" - - for k in vars_producer: + # Pruning can leave a producer output strategy with no matching + # consumer var (the consumer cannot accept that placement) or + # vice versa. Iterate the union and treat a missing side as an + # empty sum (== 0): this forbids the unmatched output strategy, + # exactly as the old inf-cost (== 0) variables did. + for k in vars_producer.keys() | vars_consumer.keys(): self.prob += ( - pulp.lpSum(vars_producer[k]) == pulp.lpSum(vars_consumer[k]), + pulp.lpSum(vars_producer.get(k, [])) + == pulp.lpSum(vars_consumer.get(k, [])), self._get_next_name("output_input_consistent"), ) @@ -805,6 +867,11 @@ def add_inf_cost_constraint(self): are forced to zero. 
∀i,a,o,j: c_{i,a,o,j} = ∞ ⟹ x_{i,a,o,j} = 0 + + Freshly built optimizers prune these edges in _build_decision_vars, so + no variable exists and this is a no-op. It still runs for optimizers + loaded from save files produced before pruning was introduced, whose + decision_vars may still contain infinite-cost entries. """ for key, dv in self.decision_vars.items(): if not math.isfinite(dv.cost): @@ -886,7 +953,16 @@ def _set_objective(self): def _solve(self, verbose=False): self._apply_memory_constraint() - solver = pulp.PULP_CBC_CMD(msg=verbose) + # The sharding ILP has a near-totally-unimodular (flow-like) structure: + # CBC's LP relaxation is naturally integral, so it solves in seconds + # with zero branch-and-bound. CBC's integer *preprocessing* (probing, + # substitutions over hundreds of thousands of binary columns) is then + # pure overhead — it dominates the solve. Disabling it (correctness is + # unaffected; CBC still does full branch-and-bound if the relaxation is + # fractional) makes the solve ~10x faster on large graphs. + # Pass as a single string: PuLP prefixes each options entry with "-", + # so this becomes the CBC flag "-preprocess off". + solver = pulp.PULP_CBC_CMD(msg=verbose, options=["preprocess off"]) # Use a dedicated temp directory for PuLP's intermediate files (.mps, # .sol, etc.) so they are always cleaned up, even if the process is # killed. Without this, leftover files can fill up /tmp (tmpfs). @@ -1072,8 +1148,12 @@ def _compute_solution_cost(self, solution): # Use pre-computed costs from decision vars instead of # estimate_strategy_runtime_cost, which needs node.meta["val"] - # (absent on loaded optimizers). - dv = self._resolve_decision_var((node_idx, 0, out_idx, 0)) + # (absent on loaded optimizers). The (.,0,out_idx,0) edge may be + # pruned, so find any surviving inp_idx for arg 0 (compute_cost is + # identical across inp_idx for a given out_idx). 
+ dv = self._find_decision_var(node_idx, 0, out_idx) + if dv is None: + continue num_args = max(len(strategy.input_specs), 1) total_compute += dv.compute_cost * num_args @@ -1408,6 +1488,8 @@ def _add_node_constraint( for argi, out_idx, inp_idx in self.walk_over_options(node): if out_idx in output_constraint_indices: var = self._get_pulp_variable((node_idx, argi, out_idx, inp_idx)) + if var is None: # pruned (invalid) strategy edge + continue vars_per_arg.setdefault(argi, []).append(var) names = [] for eqs in vars_per_arg.values(): @@ -1435,8 +1517,10 @@ def _add_paired_output_constraint(self, node_a, node_b, constraint_name): # This placement exists in node_a but not in node_b. # Disable it: force sum of its decision variables to 0. v_a = [ - self._get_pulp_variable((idx_a, 0, out_idx, inp_idx)) + v for inp_idx in range(num_inp_a) + if (v := self._get_pulp_variable((idx_a, 0, out_idx, inp_idx))) + is not None ] self.prob += ( pulp.lpSum(v_a) == 0, @@ -1445,12 +1529,16 @@ def _add_paired_output_constraint(self, node_a, node_b, constraint_name): continue out_idx_b = strat_b.index(sp) v_a = [ - self._get_pulp_variable((idx_a, 0, out_idx, inp_idx)) + v for inp_idx in range(num_inp_a) + if (v := self._get_pulp_variable((idx_a, 0, out_idx, inp_idx))) + is not None ] v_b = [ - self._get_pulp_variable((idx_b, 0, out_idx_b, inp_idx)) + v for inp_idx in range(num_inp_b) + if (v := self._get_pulp_variable((idx_b, 0, out_idx_b, inp_idx))) + is not None ] self.prob += ( pulp.lpSum(v_b) == pulp.lpSum(v_a), @@ -1653,7 +1741,9 @@ def _apply_memory_constraint(self): num_out_strat = len(self.strats[node].strategies) ratios: list[float] = [] for out_idx in range(num_out_strat): - dv = self._resolve_decision_var((node_idx, 0, out_idx, 0)) + dv = self._find_decision_var(node_idx, 0, out_idx) + if dv is None: # every edge for this strategy was pruned + continue spec: DTensorSpec = dv.input_spec assert spec.tensor_meta is not None tensor_shape: torch.Size = spec.tensor_meta.shape diff 
--git a/autoparallel/serialization.py b/autoparallel/serialization.py index 46cb3fde..af85ab56 100644 --- a/autoparallel/serialization.py +++ b/autoparallel/serialization.py @@ -286,6 +286,19 @@ def load_optimizer(cls, path): # for add_node_constraint() default placement, without needing a PG opt.mesh = _MeshPlaceholder(save_dict["mesh_shape"], save_dict["mesh_dim_names"]) + # Map saved decision-var keys to loaded node indices. Only these keys had + # a finite-cost (valid) strategy edge at save time; invalid edges were + # pruned and must not get a variable, so seed _valid_keys before creating + # the PuLP variables (see ShardingOptimizer._build_decision_vars). + save_node_names = save_dict["dv_costs_node_names"] + keys_t = save_dict["dv_costs_keys"].tolist() + vals_t = save_dict["dv_costs_vals"].tolist() + mapped_keys = [ + (opt.node_map[nodes_by_name[save_node_names[k[0]]]], k[1], k[2], k[3]) + for k in keys_t + ] + opt._valid_keys = set(mapped_keys) + # Rebuild PuLP variables and decision vars from saved costs. t2 = time.perf_counter() opt.pulp_variables = opt._create_pulp_variables() @@ -296,19 +309,14 @@ def load_optimizer(cls, path): len(opt.pulp_variables), ) # Reconstruct decision_vars from compact tensors. 
- save_node_names = save_dict["dv_costs_node_names"] - keys_t = save_dict["dv_costs_keys"].tolist() - vals_t = save_dict["dv_costs_vals"].tolist() opt.decision_vars = {} - for (save_node_idx, argi, out_idx, inp_idx), ( + for key, ( compute_cost, comm_cost, transition_cost, - ) in zip(keys_t, vals_t): - node_name = save_node_names[save_node_idx] - node = nodes_by_name[node_name] - node_idx = opt.node_map[node] - key = (node_idx, argi, out_idx, inp_idx) + ) in zip(mapped_keys, vals_t): + node_idx, argi, out_idx, inp_idx = key + node = opt.nodes[node_idx] strategy = opt.strats[node].strategies[out_idx] opt.decision_vars[key] = DecisionVar( var=opt.pulp_variables[key], diff --git a/tests/test_optimize_placement.py b/tests/test_optimize_placement.py index 59a4cf7c..9325f1f5 100644 --- a/tests/test_optimize_placement.py +++ b/tests/test_optimize_placement.py @@ -841,3 +841,38 @@ def input_fn(): # With memory budget enforced and no node constraint, the optimizer # should shard this param again assert solution[orig_node].output_specs.placements == (Shard(0),) + + +@apply_cuda_patches +def test_invalid_strategies_are_pruned(device_mesh_2d): + """Infinite-cost (invalid) strategy edges must not be materialized as + variables or constraints, and pruning them must not change the optimum.""" + import math + + mesh = device_mesh_2d + model_fn, input_fn = _make_model_and_input_fn(mesh, "transformer_block") + with torch.device("meta"): + model = model_fn() + + with AutoParallel(model, input_fn, mesh) as autop: + autop.add_input_constraints([(Shard(0), Replicate())]) + autop.add_output_constraints([(Shard(0), Replicate())]) + autop.add_parameter_memory_constraint(low=None, high=None) + opt = autop.sharding_optimizer + + # Invariant: every materialized decision var is finite-cost, and the + # PuLP variable set is exactly the set of valid (finite) keys. 
+ assert all(math.isfinite(dv.cost) for dv in opt.decision_vars.values()) + assert set(opt.pulp_variables) == opt._valid_keys + assert all(k in opt._valid_keys for k in opt.decision_vars) + + # No inf-cost (== 0) constraints should be emitted any more. + assert not any( + name.startswith("inf_cases") for name in opt.prob.constraints + ) + + # The pruned problem must still solve to a valid solution. + solution = autop.optimize_placement() + param_nodes = get_param_nodes(autop.gm.graph) + for node in param_nodes: + assert node in solution From 238443e3db2ab487bf6cdb9b104b446a0ac82f72 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sat, 30 May 2026 16:53:38 -0700 Subject: [PATCH 08/27] Add sharding annotations with Shardy-like propagation to shrink the ILP search space Users can express a tensor-parallel plan as a few annotations and have it propagated through the graph, turning the unambiguous part into ILP constraints while leaving the genuine cost tradeoffs (FSDP/data axis, residual sequence-parallelism, collective placement) to the solver. Review in this order: propagation.py introduces the propagation engine (per-mesh-axis, reshard-free, worklist fixpoint with priority rounds, pinning only Shard placements so the optimum stays reachable); optimize_sharding.py adds the primitives it emits -- per-axis node constraints (add_node_axis_constraint, with method="fix" that prunes decision variables instead of adding equality rows), memory-budget awareness of per-axis-pinned params, and solve_lp_relaxation for diagnosing/short-circuiting the solve; api.py exposes the user-facing annotate_* and propagate_annotations entry points. Then tests, example, and docs. On LLaMA3-1B (2D mesh) the annotated path reaches the same objective as the full ILP on a ~36% smaller search space and solves faster. The LP relaxation is integral on this problem, so solve_lp_relaxation(extract=True) gives an even larger, exact speedup. Authored with Claude. 
--- autoparallel/api.py | 162 +++++++++ autoparallel/optimize_sharding.py | 172 +++++++++- autoparallel/propagation.py | 487 +++++++++++++++++++++++++++ docs/README.md | 1 + docs/sharding_annotations.md | 183 ++++++++++ examples/example_llama3_annotated.py | 145 ++++++++ tests/test_propagation.py | 222 ++++++++++++ 7 files changed, 1371 insertions(+), 1 deletion(-) create mode 100644 autoparallel/propagation.py create mode 100644 docs/sharding_annotations.md create mode 100644 examples/example_llama3_annotated.py create mode 100644 tests/test_propagation.py diff --git a/autoparallel/api.py b/autoparallel/api.py index 1670d509..4fcb9ef5 100644 --- a/autoparallel/api.py +++ b/autoparallel/api.py @@ -44,6 +44,7 @@ ) from .module_construction import make_parallel_module from .optimize_sharding import ShardingOptimizer +from .propagation import ShardingAnnotation, ShardingPropagator from .shardings.placement_options import _get_device_from_mesh from .tracing import ( _add_unused_params_and_buffers, @@ -287,6 +288,8 @@ def __enter__(self): self.input_constraints = None self.output_constraints = None + self._annotations: list[tuple[Any, ShardingAnnotation]] = [] + self.propagation_result = None self.active = True @@ -356,6 +359,165 @@ def add_output_constraints(self, constraints): self.sharding_optimizer.add_sharded_output_constraint(constraints) self.output_constraints = constraints + # ---- Sharding annotations (Shardy-like propagation) ---- + + def _normalize_placements(self, placements): + """Pad/validate a placement tuple to mesh.ndim, leaving missing trailing + axes open (``None``).""" + placements = tuple(placements) + if len(placements) > self.mesh.ndim: + raise ValueError( + f"annotation has {len(placements)} placements but mesh has " + f"{self.mesh.ndim} dims" + ) + return placements + (None,) * (self.mesh.ndim - len(placements)) + + def _param_fqn_to_node(self): + from torch._functorch._aot_autograd.fx_utils import get_param_and_grad_nodes + + graph = 
self.sharding_optimizer.graph + return { + desc.target: node + for desc, (node, _grad) in get_param_and_grad_nodes(graph).items() + } + + def annotate_parameter(self, fqn, placements, priority=1): + """Annotate the sharding of one or more parameters. + + ``fqn`` is a parameter fully-qualified name, optionally a glob pattern + (e.g. ``"layers.*.attention.wq.weight"``) to annotate the matching + parameter in every layer at once. ``placements`` is a tuple of + :class:`Placement` (or ``None`` to leave a mesh axis open — typical for + the data/FSDP axis of a weight). Weights default to a lower priority + than activations so the data-parallel axis wins shared-axis conflicts. + """ + import fnmatch + + placements = self._normalize_placements(placements) + fqn_map = self._param_fqn_to_node() + matched = [node for name, node in fqn_map.items() if fnmatch.fnmatch(name, fqn)] + if not matched: + raise ValueError( + f"No parameter matches {fqn!r}. Available parameters: " + f"{sorted(fqn_map)}" + ) + for node in matched: + self._annotations.append((node, ShardingAnnotation(placements, priority))) + return matched + + def annotate_input(self, idx, placements, priority=0): + """Annotate the sharding of graph input ``idx``.""" + from torch._functorch._aot_autograd.fx_utils import ( + get_plain_input_and_grad_nodes, + ) + + placements = self._normalize_placements(placements) + graph = self.sharding_optimizer.graph + nodes = { + desc.idx: node + for desc, (node, _grad) in get_plain_input_and_grad_nodes(graph).items() + } + if idx not in nodes: + raise ValueError(f"No graph input with index {idx}; have {sorted(nodes)}") + self._annotations.append((nodes[idx], ShardingAnnotation(placements, priority))) + return nodes[idx] + + def annotate_output(self, idx, placements, priority=0): + """Annotate the sharding of graph output ``idx``.""" + from torch._functorch._aot_autograd.fx_utils import ( + get_plain_output_and_tangent_nodes, + ) + + placements = 
self._normalize_placements(placements) + graph = self.sharding_optimizer.graph + nodes = { + desc.idx: node + for desc, (node, _t) in get_plain_output_and_tangent_nodes(graph).items() + } + if idx not in nodes: + raise ValueError(f"No graph output with index {idx}; have {sorted(nodes)}") + self._annotations.append((nodes[idx], ShardingAnnotation(placements, priority))) + return nodes[idx] + + def annotate_node(self, node, placements, priority=0): + """Annotate the sharding of an arbitrary graph node.""" + placements = self._normalize_placements(placements) + self._annotations.append((node, ShardingAnnotation(placements, priority))) + return node + + def _mirror_annotations_to_backward(self): + """Build extra propagation seeds on the backward twins of annotated + forward tensors. + + A gradient shares the sharding of the value it is the gradient of, so a + forward annotation also pins its twin (parameter->grad, input->grad, + output->tangent). Seeding the twins lets the TP plan propagate through + the backward pass too. These seeds are only used for propagation: the + twins themselves stay unconstrained (handled by the forward/backward + consistency constraints), but their neighbors get determined. 
+ """ + from torch._functorch._aot_autograd.fx_utils import ( + get_param_and_grad_nodes, + get_plain_input_and_grad_nodes, + get_plain_output_and_tangent_nodes, + ) + + graph = self.sharding_optimizer.graph + twin = {} + for _d, (node, grad) in get_param_and_grad_nodes(graph).items(): + if grad is not None: + twin[node] = grad + for _d, (node, grad) in get_plain_input_and_grad_nodes(graph).items(): + if grad is not None: + twin[node] = grad + for _d, (node, tangent) in get_plain_output_and_tangent_nodes(graph).items(): + if tangent is not None: + twin[node] = tangent + + mirrored = [] + for node, ann in self._annotations: + if node in twin: + mirrored.append((twin[node], ann)) + return mirrored + + def propagate_annotations(self, verbose=True, aggressive=False, method="fix"): + """Propagate the registered annotations Shardy-style and turn the + unambiguously-determined nodes into ILP constraints, shrinking the + search space. Returns a :class:`PropagationResult`. + + Call this after the ``annotate_*`` / ``add_*_constraint`` calls and + before :meth:`optimize_placement`. + + With ``aggressive=False`` (the default) only genuine ``Shard`` axes are + pinned, which keeps the full-ILP optimum reachable. ``aggressive=True`` + also pins ``Replicate`` / ``Partial`` axes for a larger reduction at the + cost of possibly forbidding cheaper reshard placements (e.g. sequence + parallelism), so the objective may move slightly off the optimum. + + ``method`` is how each pin is enforced: ``"fix"`` (default) removes the + ruled-out decision variables (shrinks the problem; scales best on large + meshes), ``"constraint"`` adds removable ``== 1`` rows instead. 
+ """ + self._assert_entered() + propagator = ShardingPropagator(self.sharding_optimizer) + seeds = self._annotations + self._mirror_annotations_to_backward() + propagator.run(seeds) + self.propagation_result = propagator.apply_to_optimizer( + aggressive=aggressive, method=method + ) + if verbose: + logger.info( + "Annotation propagation reduced the output-strategy search " + "space by %.1f%% (%d -> %d) via %d per-axis constraints on %d " + "nodes", + 100.0 * self.propagation_result.reduction, + self.propagation_result.strategies_before, + self.propagation_result.strategies_after, + self.propagation_result.axis_constraints, + self.propagation_result.nodes_determined, + ) + return self.propagation_result + def optimize_placement(self, verbose=True): self._assert_entered() diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 6b72878b..1ac30431 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -245,6 +245,17 @@ def __init__( # so that _apply_memory_constraint can exclude constrained params and # remove_constraints can keep this in sync. self._node_constraint_names: dict[str, str] = {} + # Maps node_name → list of (mesh_dim, placement) per-axis constraints. + # A per-axis constraint keeps a param in the memory budget (unlike a full + # node constraint) but restricts which strategies it can use, so the + # budget must compute its best achievable memory ratio over only the + # strategies that satisfy these constraints. + self._node_axis_constraints: dict[ + str, list[tuple[int, Placement]] + ] = defaultdict(list) + # Variables pinned to 0 by axis constraints applied with method="fix". + # Stored so they can be restored by remove_constraints / for re-solving. + self._fixed_vars: list = [] self._name_counters: dict[str, int] = {} t0 = time.perf_counter() self.strats = self.build_sharding_metadata() @@ -911,6 +922,73 @@ def _solve(self, verbose=False): "using a larger mesh." 
) + def solve_lp_relaxation(self, verbose=False, frac_tol=1e-6, extract=False): + """Solve the continuous relaxation of the ILP (binary variables relaxed + to [0, 1]) and report diagnostics, restoring the binary categories on + exit so a later ILP solve is unaffected. + + Returns a dict with the relaxation objective (a lower bound on the ILP + optimum), the solve time, the number/fraction of decision variables that + came out fractional, and the solver status. This is the lens for + understanding why constraints (e.g. propagated annotations) speed up the + ILP: a relaxation that is tighter (objective closer to the ILP optimum) + and less fractional leaves branch-and-bound far less work. + + For this sharding problem the relaxation is empirically integral, so the + relaxation optimum equals the ILP optimum. With ``extract=True`` and an + integral solution, the dict also contains a ``"solution"`` key with the + per-node strategy dict (same form as :meth:`get_solution`) — i.e. the LP + relaxation can be used as a much cheaper exact solve, skipping + branch-and-bound. ``"solution"`` is ``None`` when the relaxation came + out fractional. + + Requires the objective to have been set (e.g. via a prior get_solution, + or _set_objective). 
+ """ + variables = self.prob.variables() + original_cats = [v.cat for v in variables] + self._apply_memory_constraint() + t0 = time.perf_counter() + try: + for v in variables: + v.cat = pulp.LpContinuous # bounds are already [0, 1] for binaries + solver = pulp.PULP_CBC_CMD(msg=verbose) + with tempfile.TemporaryDirectory() as tmpdir: + solver.tmpDir = tmpdir + self.prob.solve(solver) + solve_time = time.perf_counter() - t0 + objective = pulp.value(self.prob.objective) + n_fractional = 0 + n_vars = 0 + for v in variables: + val = v.value() + if val is None: + continue + n_vars += 1 + if min(val, 1.0 - val) > frac_tol: + n_fractional += 1 + solution = None + if extract and n_fractional == 0: + self.selected_keys = [ + key + for key, dv in self.decision_vars.items() + if dv.var.value() is not None and dv.var.value() > 0.5 + ] + for root_key in list(self.selected_keys): + self.selected_keys.extend(self._root_to_linked.get(root_key, [])) + solution = self._to_orig_solution(self._extract_and_validate_solution()) + finally: + for v, cat in zip(variables, original_cats): + v.cat = cat + return { + "objective": objective, + "solve_time": solve_time, + "n_fractional": n_fractional, + "n_vars": n_vars, + "status": pulp.LpStatus[self.prob.status], + "solution": solution, + } + def _extract_and_validate_solution(self): """Validate the ILP solution and return the optimal strategy per node.""" selected_by_node = {} @@ -1651,7 +1729,14 @@ def _apply_memory_constraint(self): continue node_idx = self.node_map[node] num_out_strat = len(self.strats[node].strategies) + # Per-axis constraints restrict which strategies this param may use, + # which raises its best achievable memory ratio (e.g. a param pinned + # to Replicate on the tensor axis can no longer be sharded there). + # The budget must reflect that, or it would under-allocate and make + # the problem spuriously infeasible. 
+ axis_constraints = self._node_axis_constraints.get(node.name, []) ratios: list[float] = [] + allowed_ratios: list[float] = [] for out_idx in range(num_out_strat): dv = self._resolve_decision_var((node_idx, 0, out_idx, 0)) spec: DTensorSpec = dv.input_spec @@ -1663,7 +1748,12 @@ def _apply_memory_constraint(self): ratio = new_size / old_size ratios.append(ratio) elms.append(dv.var * ratio) - best_ratio: float = min(ratios) + out_spec = self.strats[node].strategies[out_idx].output_specs + if isinstance(out_spec, DTensorSpec) and all( + out_spec.placements[m] == p for m, p in axis_constraints + ): + allowed_ratios.append(ratio) + best_ratio: float = min(allowed_ratios) if allowed_ratios else min(ratios) budget_low += max(best_ratio, memory_factor_low) budget_high += max(best_ratio, memory_factor_high) @@ -1717,6 +1807,86 @@ def add_node_constraint(self, node, placement=None, constraint_name=None): self._node_constraint_names[name] = node.name return names + def add_node_axis_constraint( + self, node, mesh_dim, placement, constraint_name=None, method="constraint" + ): + """Force a node's output placement on a single mesh axis, leaving the + other axes free for the ILP. + + This is the per-mesh-axis analogue of :meth:`add_node_constraint` and is + what sharding propagation emits: it can pin the tensor-parallel axis of a + weight while leaving the data axis open for FSDP. Unlike + :meth:`add_node_constraint` it does *not* register the node in + ``_node_constraint_names``, so a partially-constrained parameter is still + counted by the memory budget and can be sharded on its free axes. + + ``method`` controls how the pin is enforced: + + * ``"constraint"`` adds an ``== 1`` equality over the matching decision + variables (removable by name via :meth:`remove_constraints`). + * ``"fix"`` instead sets the upper bound of the *non-matching* decision + variables to 0. 
This shrinks the problem (the solver's presolve drops + fixed columns) rather than adding a row, which scales much better on + large meshes where adding thousands of equality rows otherwise slows + the solve. It is not removable by constraint name. + + For nodes with tuple output_specs the placement is matched against the + first DTensorSpec element, matching :meth:`add_node_constraint`. + """ + node = self._normalize_node(node) + if constraint_name is None: + constraint_name = "axis_constraint" + self._constraint_log.append( + ( + "add_node_axis_constraint", + { + "node_name": node.name, + "mesh_dim": mesh_dim, + "placement": placement, + "constraint_name": constraint_name, + "method": method, + }, + ) + ) + assert node in self.strats, (node, self.strats.keys()) + strat = self.strats[node] + output_constraint_indices = [] + for i, s in enumerate(strat.strategies): + specs = s.output_specs + spec = None + if isinstance(specs, DTensorSpec): + spec = specs + elif isinstance(specs, (list, tuple)): + spec = next((x for x in specs if isinstance(x, DTensorSpec)), None) + if spec is not None and spec.placements[mesh_dim] == placement: + output_constraint_indices.append(i) + if len(output_constraint_indices) == 0: + raise RuntimeError( + f"Couldn't find a strategy for {node} with {placement} on mesh " + f"dim {mesh_dim} (constraint {constraint_name})" + ) + self._node_axis_constraints[node.name].append((mesh_dim, placement)) + if method == "fix": + self._fix_node_output_indices(node, set(output_constraint_indices)) + return [] + return self._add_node_constraint( + node, + output_constraint_indices=output_constraint_indices, + constraint_name=constraint_name, + ) + + def _fix_node_output_indices(self, node, keep_out_idxs): + """Pin a node's output strategy by fixing every decision variable whose + out_idx is not in ``keep_out_idxs`` to 0 (upper bound).""" + node_idx = self.node_map[node] + for argi, out_idx, inp_idx in self.walk_over_options(node): + if out_idx in 
keep_out_idxs: + continue + var = self._get_pulp_variable((node_idx, argi, out_idx, inp_idx)) + if var.upBound != 0: + var.upBound = 0 + self._fixed_vars.append(var) + def _add_io_placement_constraints( self, nodes_dict, diff --git a/autoparallel/propagation.py b/autoparallel/propagation.py new file mode 100644 index 00000000..ae1e5366 --- /dev/null +++ b/autoparallel/propagation.py @@ -0,0 +1,487 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + +""" +Shardy-like sharding propagation to seed and shrink the ILP search space. + +The ILP in :mod:`optimize_sharding` enumerates, for every node, every valid +combination of input/output placements and lets the solver pick the global +optimum. For large models this search space is enormous even though, in +practice, a handful of user decisions ("these weights are tensor-parallel", +"the batch is data-parallel") already pin down the strategy for the vast +majority of the graph. + +This module lets the user attach a small number of *sharding annotations* and +then propagates them through the graph the way `Shardy +<https://github.com/openxla/shardy>`_ does: it pushes each known sharding along +edges that require no resharding, narrowing every node's set of candidate +strategies until the unambiguous nodes are fully determined. Determined nodes +are turned into ILP constraints, which collapses the search space and the solve +time while leaving the genuinely ambiguous decisions (and where to place the +necessary collectives) to the ILP. + +Key design points that mirror Shardy: + +* **Per-mesh-axis propagation.** A placement is propagated one mesh axis at a + time. This is what lets, e.g., the tensor-parallel sharding of a weight flow + through a matmul on the ``tp`` axis while the ``dp`` axis is independently + resolved (data-parallel batch, with FSDP all-gathers left to the ILP). 
It is + the analogue of Shardy projecting tensor shardings onto per-factor axes. +* **Conservative, reshard-free propagation.** Along an edge we only narrow a + consumer to the placements it can take *without* a reshard from the producer + (zero ``redistribute_cost``). At a genuine reshard boundary (a necessary + collective, e.g. an all-reduce or all-gather) no zero-cost option exists, so + propagation stops there and the ILP decides the collective. This never + empties a domain. +* **Priority rounds.** Annotations carry a priority (lower = applied first, + matching Shardy). Data/activation annotations propagate before weight + annotations so that, where they compete for the same mesh axis (the ``dp`` + axis of a matmul), the data-parallel sharding wins and the weight is + all-gathered rather than the activation being resharded. +""" + +import logging +from collections import defaultdict, deque +from dataclasses import dataclass, field +from typing import Optional + +from torch.distributed.tensor._dtensor_spec import DTensorSpec +from torch.distributed.tensor.placement_types import Placement + +logger = logging.getLogger(__name__) + +# A per-axis placement value; ``None`` means "open" (unconstrained on that axis). +AxisPlacement = Optional[Placement] + + +@dataclass(frozen=True) +class ShardingAnnotation: + """A user-provided sharding hint for one tensor (graph node). + + Args: + placements: one entry per mesh dimension. Each entry is a + :class:`Placement` (e.g. ``Shard(0)``, ``Replicate()``) or ``None`` + to leave that mesh axis open for propagation / the ILP to decide. + Leaving an axis open is the common case for weights: the user pins + the tensor-parallel axis and lets FSDP on the data axis be chosen by + the optimizer. + priority: lower numbers are propagated first. Activation/IO hints + should have a smaller priority than weight hints so the + data-parallel axis wins shared-axis conflicts. + """ + + placements: tuple[AxisPlacement, ...] 
+ priority: int = 0 + + +# Micro-strategy: a single strategy projected onto one mesh axis. +# ``in_reqs`` is the per-axis input placement required for each tensor argument +# (``None`` for non-tensor / undefined args); ``out`` is the per-axis output +# placement produced. +@dataclass(frozen=True) +class _Micro: + in_reqs: tuple[AxisPlacement, ...] + out: AxisPlacement + + +@dataclass +class PropagationResult: + """Summary of a propagation run, for logging and tests.""" + + determined: dict = field(default_factory=dict) # node -> [(mesh_dim, placement)] + strategies_before: int = 0 + strategies_after: int = 0 + nodes_touched: int = 0 + nodes_determined: int = 0 + axis_constraints: int = 0 + + @property + def reduction(self) -> float: + if self.strategies_before == 0: + return 0.0 + return 1.0 - self.strategies_after / self.strategies_before + + +class ShardingPropagator: + """Propagates sharding annotations over an optimizer's strategy graph. + + The propagator works on the optimizer's concrete graph and reuses its + per-node ``OpStrategy`` list (``optimizer.strats``) as the per-op sharding + rules. It maintains, for every single-output node and every mesh axis, the + set of still-feasible per-axis (input-requirement, output) micro-strategies + and shrinks them to a fixed point. 
+ """ + + def __init__(self, optimizer): + self.opt = optimizer + self.mesh = optimizer.mesh + self.ndim = optimizer.mesh.ndim + + # node -> list (indexed by mesh dim) of list[_Micro] + self.micros: dict = {} + # node -> list (indexed by mesh dim) of set[int] (feasible micro indices) + self.dom: dict = {} + # nodes whose domain has been narrowed below the initial full set + self.touched: set = set() + self._initial_strategy_count: dict = {} + + self._build_micros() + + # ---- construction ---- + + def _build_micros(self): + for node, op_strat in self.opt.strats.items(): + if node.op == "output": + continue + strategies = op_strat.strategies + if not strategies: + continue + # Multi-output nodes (tuple output_specs, e.g. SDPA) are propagation + # barriers: there is no single output placement to project, so we + # neither narrow them nor propagate across them. Their getitem + # users are single-output and handled normally. + if not isinstance(strategies[0].output_specs, DTensorSpec): + continue + + args = self.opt._all_input_nodes(node) + n_args = len(args) + self._initial_strategy_count[node] = len(strategies) + + per_axis_index: list = [dict() for _ in range(self.ndim)] + per_axis_micros: list = [[] for _ in range(self.ndim)] + for s in strategies: + out_pl = s.output_specs.placements + in_pls = [] + for a in range(n_args): + isp = s.input_specs[a] if a < len(s.input_specs) else None + in_pls.append( + isp.placements if isinstance(isp, DTensorSpec) else None + ) + for m in range(self.ndim): + in_reqs = tuple(None if pl is None else pl[m] for pl in in_pls) + micro = _Micro(in_reqs=in_reqs, out=out_pl[m]) + idx = per_axis_index[m] + if micro not in idx: + idx[micro] = len(per_axis_micros[m]) + per_axis_micros[m].append(micro) + self.micros[node] = per_axis_micros + self.dom[node] = [ + set(range(len(per_axis_micros[m]))) for m in range(self.ndim) + ] + + # ---- accessors ---- + + def _out_set(self, node, m) -> set: + micros = self.micros[node][m] + return 
{micros[i].out for i in self.dom[node][m]} + + def _in_req_set(self, node, m, a) -> set: + micros = self.micros[node][m] + return {micros[i].in_reqs[a] for i in self.dom[node][m]} + + def _consumer_edges(self, node): + """Yield (consumer, arg_index) for each tensor edge out of ``node``.""" + for user in node.users: + if user not in self.dom: + continue + in_nodes = self.opt._all_input_nodes(user) + for a, src in enumerate(in_nodes): + if src is node: + yield user, a + + # ---- seeding ---- + + def seed(self, node, placements: tuple) -> bool: + node = self.opt._normalize_node(node) + if node not in self.dom: + logger.debug("seed: %s is not a single-output node, ignoring", node) + return False + changed = False + for m in range(self.ndim): + want = placements[m] if m < len(placements) else None + if want is None: + continue + micros = self.micros[node][m] + # Seeding is authoritative: recompute from the full strategy set so a + # user annotation overrides any earlier (lower-priority) propagation + # that may have narrowed this axis away from the annotated value. 
+ keep = {i for i in range(len(micros)) if micros[i].out == want} + if not keep: + available = {micros[i].out for i in range(len(micros))} + raise ValueError( + f"Annotation {placements} is not achievable for node " + f"{node} on mesh dim {m}: this op only supports " + f"{available} on that axis" + ) + if keep != self.dom[node][m]: + self.dom[node][m] = keep + changed = True + if changed: + self.touched.add(node) + return changed + + # ---- narrowing ---- + + def _narrow_from_producers(self, node) -> bool: + """Narrow ``node`` (as a consumer) toward reshard-free inputs.""" + changed = False + args = self.opt._all_input_nodes(node) + for a, producer in enumerate(args): + if producer not in self.dom: + continue # barrier or non-tensor producer + for m in range(self.ndim): + prod_outs = self._out_set(producer, m) + cur = self.dom[node][m] + micros = self.micros[node][m] + keep = {i for i in cur if micros[i].in_reqs[a] in prod_outs} + # Only tighten when a zero-reshard option exists; an empty keep + # means this edge is a genuine reshard boundary -> leave it to + # the ILP. + if keep and keep != cur: + self.dom[node][m] = keep + changed = True + return changed + + def _narrow_from_consumer(self, node) -> bool: + """Narrow ``node`` (as a producer) toward what its single consumer wants. + + Restricted to single-consumer producers: a multi-consumer value (e.g. a + residual stream) may legitimately be resharded for some consumers, so we + do not let one consumer dictate it. + + Placeholders (parameters, buffers, graph inputs) are never narrowed this + way: their placement is the *stored* sharding, which legitimately differs + from the *compute* sharding the consumer needs by a reshard (e.g. an FSDP + all-gather on the data axis). Inferring the storage sharding from the + consumer would wrongly pin, e.g., a weight to Replicate on the data axis + and defeat FSDP. A placeholder's sharding comes only from its own + annotation; everything else about it is left to the ILP. 
+ """ + if node.op in ("placeholder", "get_attr"): + return False + edges = list(self._consumer_edges(node)) + if len(edges) != 1: + return False + consumer, a = edges[0] + changed = False + for m in range(self.ndim): + cons_reqs = self._in_req_set(consumer, m, a) + cur = self.dom[node][m] + micros = self.micros[node][m] + keep = {i for i in cur if micros[i].out in cons_reqs} + if keep and keep != cur: + self.dom[node][m] = keep + changed = True + return changed + + def _narrow_node(self, node) -> bool: + c1 = self._narrow_from_producers(node) + c2 = self._narrow_from_consumer(node) + changed = c1 or c2 + if changed: + self.touched.add(node) + return changed + + def propagate(self): + """Run the worklist narrowing to a fixed point.""" + wl = deque(self.dom.keys()) + inq = set(self.dom.keys()) + steps = 0 + while wl: + node = wl.popleft() + inq.discard(node) + steps += 1 + if not self._narrow_node(node): + continue + # Re-enqueue neighbors whose domains may now narrow further. + neighbors = list(self.opt._all_input_nodes(node)) + neighbors += [u for u in node.users] + for nb in neighbors: + if nb in self.dom and nb not in inq: + wl.append(nb) + inq.add(nb) + logger.debug("propagation fixpoint reached in %d worklist steps", steps) + + # ---- results ---- + + def determined(self) -> dict: + """node -> list[(mesh_dim, placement)] for every determined axis of a + node that propagation actually touched.""" + res = {} + for node in self.dom: + if node not in self.touched: + continue + axes = [] + for m in range(self.ndim): + outs = self._out_set(node, m) + if len(outs) == 1: + axes.append((m, next(iter(outs)))) + if axes: + res[node] = axes + return res + + def _feasible_strategy_count(self, node, determined_axes) -> int: + """How many of ``node``'s strategies satisfy all determined axes.""" + strategies = self.opt.strats[node].strategies + count = 0 + for s in strategies: + spec = s.output_specs + if not isinstance(spec, DTensorSpec): + count += 1 + continue + if 
all(spec.placements[m] == p for m, p in determined_axes): + count += 1 + return count + + def run(self, annotations) -> dict: + """Seed ``annotations`` in priority order and propagate to a fixed point. + + ``annotations`` is a list of ``(node, ShardingAnnotation)``. Returns the + ``determined()`` mapping. + """ + by_priority: dict = defaultdict(list) + for node, ann in annotations: + by_priority[ann.priority].append((node, ann)) + for priority in sorted(by_priority): + for node, ann in by_priority[priority]: + self.seed(node, ann.placements) + self.propagate() + return self.determined() + + def _paired_boundary_nodes(self) -> set: + """Backward nodes tied to a forward node by a forward/backward + consistency constraint: parameter gradients, input gradients, and output + tangents. These must be left to the pairing (which mirrors the forward + decision onto them); constraining them independently can contradict it. + """ + from torch._functorch._aot_autograd.fx_utils import ( + get_param_and_grad_nodes, + get_plain_input_and_grad_nodes, + get_plain_output_and_tangent_nodes, + ) + + graph = self.opt.graph + nodes = set() + for _p, grad in get_param_and_grad_nodes(graph).values(): + if grad is not None: + nodes.add(grad) + for _i, grad in get_plain_input_and_grad_nodes(graph).values(): + if grad is not None: + nodes.add(grad) + for _o, tangent in get_plain_output_and_tangent_nodes(graph).values(): + if tangent is not None: + nodes.add(tangent) + return nodes + + def _backward_node_set(self) -> set: + """Nodes belonging to the backward pass: everything reachable from a + tangent (incoming-gradient) placeholder. + + Propagation does not constrain these. Their sharding is tied to the + forward pass by the optimizer's forward/backward consistency constraints + (param<->grad, input<->grad, output<->tangent), so constraining them + independently risks contradicting that pairing (e.g. forcing a weight's + gradient to a placement its parameter cannot take). 
Leaving them to the + ILP keeps the problem feasible while the forward constraints already + collapse most of the backward search space through the pairing. + """ + seeds = [ + n + for n in self.opt.graph.nodes + if n.op == "placeholder" and n.name.startswith("tangents") + ] + backward = set() + stack = list(seeds) + while stack: + n = stack.pop() + for u in n.users: + if u not in backward: + backward.add(u) + stack.append(u) + return backward + + def _total_strategy_count(self) -> int: + total = 0 + for node, op_strat in self.opt.strats.items(): + if node.op == "output": + continue + total += len(op_strat.strategies) + return total + + def apply_to_optimizer( + self, forward_only=False, aggressive=False, method="fix" + ) -> PropagationResult: + """Emit per-axis constraints for every determined axis of every touched + node and return a summary of the search-space reduction. + + Nodes the user already constrained explicitly are skipped, as are the + forward/backward *paired boundary* nodes (parameter/input gradients and + output tangents), whose sharding is decided by the pairing rather than + propagation. When ``forward_only`` is set, all backward-pass nodes are + skipped (more conservative; only the forward graph is constrained). A + node is also skipped if its determined axes do not co-occur in any single + strategy (a safety net, not expected in practice). + + By default (``aggressive=False``) an axis is only pinned when it is a + genuine ``Shard``. A Shard encodes the tensor-parallel structure the + annotations describe and is invariant in the optimum. ``Replicate`` and + ``Partial`` are deliberately *not* pinned: + + * Pinning ``Replicate`` would forbid the ILP from instead sharding that + axis (e.g. choosing sequence parallelism on the residual stream). 
+ * ``Partial`` is a pending reduction whose collective (all-reduce / + reduce-scatter) the ILP places; pinning it fixes where the reduction + happens and can even be infeasible (a Partial value cannot be added to + a Replicate residual without first reducing it). + + Both are genuine cost tradeoffs, so leaving them open keeps the optimum + reachable while costing little search-space reduction. + + ``method`` is forwarded to :meth:`ShardingOptimizer.add_node_axis_constraint`: + ``"fix"`` (default) removes the ruled-out decision variables so the + problem actually shrinks, ``"constraint"`` adds equality rows instead. + """ + determined = self.determined() + already = set(self.opt._node_constraint_names.values()) + excluded = self._paired_boundary_nodes() + if forward_only: + excluded |= self._backward_node_set() + + result = PropagationResult(determined=determined) + result.strategies_before = self._total_strategy_count() + result.nodes_touched = len(self.touched) + + strategies_saved = 0 + for node, axes in determined.items(): + if node.name in already or node in excluded: + continue + pin_axes = [(m, p) for m, p in axes if aggressive or p.is_shard()] + if not pin_axes: + continue + full = len(self.opt.strats[node].strategies) + feasible = self._feasible_strategy_count(node, pin_axes) + if feasible == 0 or feasible == full: + continue + for m, p in pin_axes: + self.opt.add_node_axis_constraint( + node, m, p, constraint_name="propagated", method=method + ) + result.axis_constraints += 1 + result.nodes_determined += 1 + strategies_saved += full - feasible + + result.strategies_after = result.strategies_before - strategies_saved + logger.info( + "propagation: touched %d nodes, constrained %d nodes with %d " + "per-axis constraints; output-strategy choices %d -> %d (%.1f%% " + "reduction)", + result.nodes_touched, + result.nodes_determined, + result.axis_constraints, + result.strategies_before, + result.strategies_after, + 100.0 * result.reduction, + ) + return result 
diff --git a/docs/README.md b/docs/README.md index 9299286f..4aa2dc2d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -23,5 +23,6 @@ If you're new to the project, use the reading order below. ## Advanced usage +- [Sharding Annotations and Shardy-like Propagation](sharding_annotations.md) - [Using `local_map` for MoE and Custom Communication Patterns](local_map_and_moe.md) - [Saving and Loading Optimizer State](save_load.md) diff --git a/docs/sharding_annotations.md b/docs/sharding_annotations.md new file mode 100644 index 00000000..b9248cb9 --- /dev/null +++ b/docs/sharding_annotations.md @@ -0,0 +1,183 @@ +# Sharding Annotations and Shardy-like Propagation + +By default AutoParallel hands the entire sharding decision to the ILP: every +node enumerates every valid placement and the solver picks the global optimum. +That is the right default for a fresh model, but at scale the search space is +large even though the user often already knows the high-level plan — "the +attention and MLP projections are tensor-parallel; the batch is data-parallel". + +This page describes how to express that plan as a few **sharding annotations** +and have AutoParallel **propagate** them through the graph the way +[Shardy](https://github.com/openxla/shardy) does, turning the unambiguous part +of the graph into ILP constraints. This shrinks the search space and the solve +time while leaving the genuine cost tradeoffs to the solver. With a typical +tensor-parallel annotation on LLaMA-3 it reaches the *same* objective as the +full ILP on a noticeably smaller problem. + +If you are new to the project, start with +[Getting Started](getting_started.md) and +[How AutoParallel Chooses a Strategy](how_autoparallel_chooses_a_strategy.md). 
+ +## The annotation API + +Annotations are added on the `AutoParallel` context manager, after the input / +output constraints and before `optimize_placement`: + +```python +with AutoParallel(model, input_fn, mesh) as autop: + autop.add_parameter_memory_constraint(low=None, high=None) + autop.add_input_constraints([(Shard(0), Replicate())]) + autop.add_output_constraints([(Shard(0), Shard(2))]) + + # Annotate the tensor-parallel plan. A glob matches the weight in every + # layer at once. Only the tp axis is pinned; the data axis is left open. + column_parallel = (None, Shard(0)) # shard the output dim + row_parallel = (None, Shard(1)) # shard the input dim + for proj in ["wq", "wk", "wv"]: + autop.annotate_parameter(f"layers.*.attention.{proj}.weight", column_parallel) + autop.annotate_parameter("layers.*.attention.wo.weight", row_parallel) + for proj in ["w1", "w3"]: + autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", column_parallel) + autop.annotate_parameter("layers.*.feed_forward.w2.weight", row_parallel) + + autop.propagate_annotations() # propagate + constrain + sharding = autop.optimize_placement() +``` + +A placement is a tuple with one entry per mesh dimension. Each entry is a +`Placement` (`Shard(d)`, `Replicate()`, ...) or **`None`** to leave that mesh +axis *open* for propagation / the ILP to decide. Leaving the data axis open is +the common case for weights: you pin the tensor-parallel axis and let the +optimizer choose FSDP vs DDP on the data axis. + +The available annotation methods are: + +- `annotate_parameter(fqn, placements, priority=1)` — `fqn` is a parameter + fully-qualified name or a glob pattern (e.g. `"layers.*.attention.wq.weight"`). +- `annotate_input(idx, placements, priority=0)` / + `annotate_output(idx, placements, priority=0)` — graph input/output by index. +- `annotate_node(node, placements, priority=0)` — an arbitrary FX node. + +`priority` controls the order annotations propagate (lower first, matching +Shardy). 
Activations/IO default to priority 0 and weights to priority 1, so +activations/IO propagate first: where they compete for the same mesh axis (the +data axis of a matmul) the data-parallel sharding wins and the weight is +all-gathered, rather than the activation being resharded. + +`propagate_annotations()` returns a `PropagationResult` summarizing the +reduction (`nodes_determined`, `axis_constraints`, `reduction`). + +## How propagation works + +Propagation mirrors the structure of Shardy's propagation, expressed over +AutoParallel's existing per-node strategy lists (which already encode each op's +sharding rule): + +- **Per-mesh-axis.** A placement is propagated one mesh axis at a time. This is + what lets a weight's tensor-parallel sharding flow through a matmul on the + `tp` axis while the `dp` axis is resolved independently (data-parallel batch, + with FSDP all-gathers left to the ILP). It is the analogue of Shardy + projecting tensor shardings onto per-factor axes. + +- **Reshard-free.** Along an edge a consumer is only narrowed to the placements + it can take *without* a reshard from the producer (zero redistribution cost). + At a genuine reshard boundary — a necessary collective such as an all-reduce + or all-gather — no zero-cost option exists, so propagation stops there and the + ILP decides the collective. + +- **To a fixed point.** A worklist re-examines a node's neighbors whenever its + set of candidate shardings shrinks, until nothing changes. + +- **Priority rounds.** Annotations propagate in priority order; a later round only + narrows what an earlier round determined, except on a node it annotates directly, where the annotation is authoritative. + +Once propagation reaches a fixed point, each mesh axis of a node whose sharding +became unambiguous is a candidate for a per-axis ILP constraint +(`add_node_axis_constraint`), which constrains that one axis and leaves the rest +of the node free; which candidates are actually pinned is described next.
+ +### What is and isn't pinned + +Propagation deliberately only pins genuine **`Shard`** placements — the +tensor-parallel structure the annotations describe, which is invariant in the +optimum. It does *not* pin: + +- **`Replicate`** — pinning it would forbid the ILP from instead sharding that + axis (for example choosing sequence parallelism on the residual stream). +- **`Partial`** — a pending reduction whose collective the ILP places; pinning + it fixes where the reduction happens and can even be infeasible (a `Partial` + value cannot be added to a `Replicate` residual without first reducing it). + +Both are genuine cost tradeoffs, so leaving them open keeps the optimum +reachable at little cost to the reduction. + +Two more correctness rules keep the constraint set feasible and faithful: + +- **Parameters are sources only.** A parameter's placement is its *stored* + sharding, which legitimately differs from the *compute* sharding a consumer + needs by a reshard (an FSDP all-gather). Propagation never infers a + parameter's sharding from its consumers, so an open data axis stays free for + FSDP, and a per-axis parameter constraint still counts toward the memory + budget on its free axes. +- **Backward pass via the pairing.** The forward/backward consistency + constraints already tie each gradient to its forward tensor, so the + parameter/input gradients and output tangents are left for the pairing to + decide; the rest of the backward graph is constrained normally (and the + forward annotations are mirrored onto the gradients to drive that). + +## How a pin is applied: variable fixing vs constraints + +`propagate_annotations(method=...)` (forwarded to +`ShardingOptimizer.add_node_axis_constraint`) controls how each determined axis +is committed to the ILP: + +- **`"fix"` (default)** sets the upper bound of the ruled-out decision variables + to 0, so the solver's presolve drops those columns and the problem actually + shrinks. 
+- **`"constraint"`** adds an `== 1` equality row over the matching variables. + It is removable by name, but on a large mesh adding thousands of rows without + removing any columns can *slow* the solve. + +Variable fixing is strictly better for solve time (and never worse for the +objective), which is why it is the default. + +## Solver performance and the LP relaxation + +`ShardingOptimizer.solve_lp_relaxation()` solves the continuous relaxation +(binaries relaxed to `[0, 1]`) and reports the objective, solve time, and how +many variables came out fractional. It exposes two facts that matter for +performance: + +1. **The relaxation is integral.** On LLaMA-3 (2D and 3D meshes), with and + without annotations, the LP relaxation comes out with *zero* fractional + variables and an integrality gap of 0% — its optimum already *is* the integer + optimum. So the `solution` entry of the `solve_lp_relaxation(extract=True)` result is a valid optimal + per-node strategy dict (same form as `get_solution`) obtained while skipping + branch-and-bound, which is several times faster than the MILP solve (e.g. on + the 16-layer 2D model, ~10s vs ~50s; on a 2M-variable 3D problem, ~45s vs + ~160s). This is the single biggest available speedup and is exact whenever + the relaxation is integral (the `solution` entry is `None` when it is not). + +2. **Where annotations help the MILP.** Because the relaxation is integral, + there is little branch-and-bound to cut, so the annotation speedup is + scale-dependent: on a ~400k-variable problem the MILP overhead is a large + fraction and pinning the TP structure gives ~1.7–1.8×; on a ~2M-variable + problem the solve is dominated by the relaxation/model size itself, so the + speedup shrinks toward ~1× even though the *search space* shrinks more (the + extra mesh axis gives more axes to pin — e.g. −59% strategy choices on 3D vs + −36% on 2D). The annotation speedup on the *LP* solve is correspondingly + modest (~1.1–1.4×).
The takeaway: annotations reduce the search space and + keep the optimum exact, but for raw solve time on this (integral) problem the + larger lever is solving the relaxation directly. + +A separate, orthogonal cost is that building the ILP for a 3-axis mesh is slow: +per-node strategy enumeration grows with the number of mesh axes (it is cubic +for a 3-axis mesh, dominated by the 4D attention tensors), which is independent +of the solve and of annotations. + +## Example + +`examples/example_llama3_annotated.py` runs the full ILP and the +annotated+propagation path on a LLaMA-3-1B model on a 2D mesh and prints the +comparison: the annotated path reaches the same objective on a search space +reduced by roughly a third, with a correspondingly faster solve. diff --git a/examples/example_llama3_annotated.py b/examples/example_llama3_annotated.py new file mode 100644 index 00000000..7e1f1ecb --- /dev/null +++ b/examples/example_llama3_annotated.py @@ -0,0 +1,145 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + +"""Sharding annotations + Shardy-like propagation on LLaMA3-1B (2D mesh). + +By default AutoParallel hands the whole sharding decision to the ILP. At scale +a user usually already knows the tensor-parallel plan ("these projections are +column-parallel, those are row-parallel"). This example shows how to express +that plan as a few *annotations*, propagate it through the graph the way Shardy +does, and turn the unambiguous part of the graph into ILP constraints. + +The annotations pin only the **tensor-parallel (tp) axis** of the transformer +body weights. Everything else -- the data/FSDP axis, the residual stream +(replicate vs sequence-parallel), the vocab/embedding sharding, and where the +collectives go -- is left to the ILP. 
Propagation then determines the sharding +of the activations that *follow* from the plan with no resharding and constrains +them, which shrinks the search space and the solve time while leaving the +genuine cost tradeoffs to the solver. + +Run it (no GPUs needed -- uses a fake process group): + + python examples/example_llama3_annotated.py +""" + +import logging +import time + +import pulp +import torch +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import ( + Transformer, + TransformerModelArgs, + apply_ac, +) +from autoparallel.api import AutoParallel + +logging.basicConfig(level=logging.WARNING) + +world_size = 64 +fake_store = FakeStore() +torch.distributed.init_process_group( + "fake", store=fake_store, rank=0, world_size=world_size +) + +# 2D mesh: data/FSDP on dp, tensor-parallel on tp. +dp, tp = world_size // 8, 8 +mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", (dp, tp), mesh_dim_names=("dp", "tp") +) + +# Small-batch / long-sequence regime, where tensor parallelism is worthwhile. +vocab_size = 128256 +seqlen = 2048 +batch_size = 2 * dp + + +def model_fn(): + # LLaMA-3.2-1B-ish config. + return Transformer( + TransformerModelArgs( + dim=2048, + n_layers=16, + n_heads=32, + n_kv_heads=8, + ffn_dim_multiplier=1.5, + multiple_of=256, + rope_theta=500000, + vocab_size=vocab_size, + max_seq_len=seqlen, + ) + ) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda") + + +def annotate_tp_plan(autop): + """The 'conscious' tensor-parallel plan, as a handful of annotations. + + Only the tp axis is pinned (the data axis is left ``None`` = open). A glob + pattern annotates the matching weight in every layer at once. 
+ """ + column_parallel = (None, Shard(0)) # shard the output dim (dim 0 of [out, in]) + row_parallel = (None, Shard(1)) # shard the input dim (dim 1 of [out, in]) + for proj in ["wq", "wk", "wv"]: + autop.annotate_parameter(f"layers.*.attention.{proj}.weight", column_parallel) + autop.annotate_parameter("layers.*.attention.wo.weight", row_parallel) + for proj in ["w1", "w3"]: + autop.annotate_parameter( + f"layers.*.feed_forward.{proj}.weight", column_parallel + ) + autop.annotate_parameter("layers.*.feed_forward.w2.weight", row_parallel) + + +with torch.device("meta"): + model = model_fn() +apply_ac(model, mode="full") + +with AutoParallel(model, input_fn, mesh, repeated_subgraphs=True) as autop: + autop.add_parameter_memory_constraint(low=None, high=None) + autop.add_input_constraints([(Shard(0), Replicate())]) + autop.add_output_constraints([(Shard(0), Shard(2))]) # vocab-parallel logits + opt = autop.sharding_optimizer + print( + f"ILP: {len(opt.strats)} nodes, {len(opt.decision_vars)} decision variables " + f"on a ({dp}, {tp}) mesh" + ) + + # --- Baseline: full ILP, no annotations --- + t = time.perf_counter() + autop.optimize_placement(verbose=False) + t_baseline = time.perf_counter() - t + obj_baseline = pulp.value(opt.prob.objective) + print( + f"baseline full ILP : objective {obj_baseline:11.1f} solve {t_baseline:6.1f}s" + ) + + # --- Annotated: propagate the TP plan, then solve the reduced problem --- + annotate_tp_plan(autop) + result = autop.propagate_annotations(verbose=False) + t = time.perf_counter() + opt.resolve(verbose=False) + t_annotated = time.perf_counter() - t + obj_annotated = pulp.value(opt.prob.objective) + print( + f"annotated + propag: objective {obj_annotated:11.1f} solve {t_annotated:6.1f}s" + ) + + gap = 100 * (obj_annotated - obj_baseline) / obj_baseline + print( + f"\npropagation pinned {result.nodes_determined} nodes " + f"({result.axis_constraints} per-axis constraints), shrinking the " + f"output-strategy search space by {100 * 
result.reduction:.1f}% " + f"({result.strategies_before} -> {result.strategies_after})" + ) + print( + f"objective gap vs full ILP: {gap:+.2f}% " + f"solve speedup: {t_baseline / max(t_annotated, 1e-9):.1f}x" + ) diff --git a/tests/test_propagation.py b/tests/test_propagation.py new file mode 100644 index 00000000..34bb7af2 --- /dev/null +++ b/tests/test_propagation.py @@ -0,0 +1,222 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + +import pulp +import pytest +import torch +import torch.nn.functional as F +from conftest import apply_cuda_patches +from torch import nn +from torch._functorch._aot_autograd.fx_utils import get_param_and_grad_nodes +from torch.distributed.tensor.placement_types import Replicate, Shard + +from autoparallel.api import AutoParallel +from autoparallel.propagation import ShardingAnnotation, ShardingPropagator + + +class TPBlock(nn.Module): + """A minimal transformer block: attention + SwiGLU FFN, the structure a + column/row-parallel tensor-parallel plan applies to.""" + + def __init__(self, dim=512, hidden=1024, nheads=8): + super().__init__() + self.nheads = nheads + self.wq = nn.Linear(dim, dim, bias=False) + self.wk = nn.Linear(dim, dim, bias=False) + self.wv = nn.Linear(dim, dim, bias=False) + self.wo = nn.Linear(dim, dim, bias=False) + self.w1 = nn.Linear(dim, hidden, bias=False) + self.w2 = nn.Linear(hidden, dim, bias=False) + self.w3 = nn.Linear(dim, hidden, bias=False) + + def forward(self, x): + q, k, v = self.wq(x), self.wk(x), self.wv(x) + q = q.unflatten(-1, (self.nheads, -1)).permute(0, 2, 1, 3) + k = k.unflatten(-1, (self.nheads, -1)).permute(0, 2, 1, 3) + v = v.unflatten(-1, (self.nheads, -1)).permute(0, 2, 1, 3) + o = F.scaled_dot_product_attention(q, k, v) + o = o.permute(0, 2, 1, 3).flatten(-2) + h = self.wo(o) + x + return h + self.w2(F.silu(self.w1(h)) * self.w3(h)) 
+ + +def _input_fn(): + bs = 32 + return torch.randn(bs, 128, 512, device="cuda", requires_grad=True) + + +def _enter_autop(mesh): + with torch.device("meta"): + model = TPBlock() + autop = AutoParallel(model, _input_fn, mesh) + autop.__enter__() + autop.add_parameter_memory_constraint(low=None, high=None) + x_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1) + autop.add_input_constraints([x_sharding]) + autop.add_output_constraints([x_sharding]) + return autop + + +def _annotate_tp(autop): + col, row = (None, Shard(0)), (None, Shard(1)) + for proj in ["wq", "wk", "wv", "w1", "w3"]: + autop.annotate_parameter(f"{proj}.weight", col) + for proj in ["wo", "w2"]: + autop.annotate_parameter(f"{proj}.weight", row) + + +@apply_cuda_patches +def test_propagation_matches_full_ilp(device_mesh_2d): + """Annotating the TP plan and propagating shrinks the search space while the + reduced ILP reaches the same optimum as the full ILP.""" + autop = _enter_autop(device_mesh_2d) + try: + opt = autop.sharding_optimizer + autop.optimize_placement(verbose=False) + obj_full = pulp.value(opt.prob.objective) + + _annotate_tp(autop) + result = autop.propagate_annotations(verbose=False) + opt.resolve(verbose=False) + obj_annotated = pulp.value(opt.prob.objective) + + assert opt.prob.status == 1 # Optimal + # Same optimum (propagation only pins reshard-free, unambiguous sharding). + assert obj_annotated == pytest.approx(obj_full, rel=1e-6) + # And it actually pruned a meaningful chunk of the search space. 
+ assert result.reduction > 0.1 + assert result.nodes_determined > 0 + finally: + autop.__exit__(None, None, None) + + +@apply_cuda_patches +def test_lp_relaxation_is_integral_and_exact(device_mesh_2d): + """The LP relaxation of the sharding ILP is integral here, so solving it is a + cheaper exact solve: same objective as the ILP, with an extractable solution.""" + autop = _enter_autop(device_mesh_2d) + try: + opt = autop.sharding_optimizer + autop.optimize_placement(verbose=False) + obj_ilp = pulp.value(opt.prob.objective) + + lp = opt.solve_lp_relaxation(extract=True) + assert lp["n_fractional"] == 0 # relaxation is integral + assert lp["objective"] == pytest.approx(obj_ilp, rel=1e-6) + assert lp["solution"] is not None + # one strategy per (single-output) decision node + assert len(lp["solution"]) > 0 + finally: + autop.__exit__(None, None, None) + + +@apply_cuda_patches +def test_axis_constraint_fix_method_matches_constraint(device_mesh_2d): + """Pinning an axis by fixing variables gives the same result as the equality + constraint, and is exact.""" + autop = _enter_autop(device_mesh_2d) + try: + opt = autop.sharding_optimizer + fqn = {d.target: n for d, (n, _) in get_param_and_grad_nodes(opt.graph).items()} + wq = fqn["wq.weight"] + opt.add_node_axis_constraint(wq, mesh_dim=1, placement=Shard(0), method="fix") + solution = autop.optimize_placement(verbose=False) + assert opt.prob.status == 1 + placements = solution[opt._concrete_to_orig.get(wq, wq)].output_specs.placements + assert placements[1] == Shard(0) + finally: + autop.__exit__(None, None, None) + + +@apply_cuda_patches +def test_add_node_axis_constraint_pins_one_axis(device_mesh_2d): + """A per-axis constraint pins the chosen mesh axis and leaves the other free.""" + autop = _enter_autop(device_mesh_2d) + try: + opt = autop.sharding_optimizer + fqn = {d.target: n for d, (n, _) in get_param_and_grad_nodes(opt.graph).items()} + wq = fqn["wq.weight"] + opt.add_node_axis_constraint(wq, mesh_dim=1, 
placement=Shard(0)) + solution = autop.optimize_placement(verbose=False) + placements = solution[opt._concrete_to_orig.get(wq, wq)].output_specs.placements + # tp axis pinned to Shard(0); dp axis decided by the ILP. + assert placements[1] == Shard(0) + finally: + autop.__exit__(None, None, None) + + +@apply_cuda_patches +def test_axis_constraint_keeps_param_shardable_for_fsdp(device_mesh_2d): + """A per-axis tp constraint must not exclude a parameter from the memory + budget: it should still be shardable on the (free) data axis for FSDP.""" + autop = _enter_autop(device_mesh_2d) + try: + opt = autop.sharding_optimizer + fqn = {d.target: n for d, (n, _) in get_param_and_grad_nodes(opt.graph).items()} + wq = fqn["wq.weight"] + # Column-parallel on tp; data axis left open. + opt.add_node_axis_constraint(wq, mesh_dim=1, placement=Shard(0)) + solution = autop.optimize_placement(verbose=False) + assert opt.prob.status == 1 # feasible despite the tight memory budget + placements = solution[opt._concrete_to_orig.get(wq, wq)].output_specs.placements + # FSDP shards the data axis too (tight 1/world_size budget). + assert placements[0] == Shard(0) + assert placements[1] == Shard(0) + finally: + autop.__exit__(None, None, None) + + +@apply_cuda_patches +def test_seed_unachievable_raises(device_mesh_2d): + autop = _enter_autop(device_mesh_2d) + try: + opt = autop.sharding_optimizer + prop = ShardingPropagator(opt) + fqn = {d.target: n for d, (n, _) in get_param_and_grad_nodes(opt.graph).items()} + wq = fqn["wq.weight"] + # wq.weight is 2D; sharding a non-existent tensor dim 5 is impossible. 
+ with pytest.raises(ValueError): + prop.seed(wq, (None, Shard(5))) + finally: + autop.__exit__(None, None, None) + + +@apply_cuda_patches +def test_propagation_determines_matmul_outputs(device_mesh_2d): + """Seeding the column-parallel weights determines the tp axis of the matmul + outputs (sharded on the output feature) with no resharding.""" + autop = _enter_autop(device_mesh_2d) + try: + opt = autop.sharding_optimizer + prop = ShardingPropagator(opt) + annotations = [] + fqn = {d.target: n for d, (n, _) in get_param_and_grad_nodes(opt.graph).items()} + for proj in ["wq", "wk", "wv", "w1", "w3"]: + annotations.append( + (fqn[f"{proj}.weight"], ShardingAnnotation((None, Shard(0)), 1)) + ) + for proj in ["wo", "w2"]: + annotations.append( + (fqn[f"{proj}.weight"], ShardingAnnotation((None, Shard(1)), 1)) + ) + determined = prop.run(annotations) + + # Every column-parallel matmul output should be tp-sharded (not replicated). + einsum_nodes = opt.graph.find_nodes( + op="call_function", target=torch.ops.aten.einsum.default + ) + if not einsum_nodes: + einsum_nodes = opt.graph.find_nodes( + op="call_function", target=torch.ops.aten.mm.default + ) + n_tp_pinned = 0 + for n in einsum_nodes: + if n in determined: + tp = dict(determined[n]).get(1) + if isinstance(tp, Shard): + n_tp_pinned += 1 + assert n_tp_pinned > 0 + finally: + autop.__exit__(None, None, None) From c33c0ef2b87400882fa0f51e322cdaa87ac29e17 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sat, 30 May 2026 18:55:04 -0700 Subject: [PATCH 09/27] Integrate prune + dp_solver + annotated into a joint optimization Make the approximate (dp) solver work with the pruned search space and the propagated per-axis annotations, the two pieces neither branch had on its own: - Pruning removes infinite-cost edges from decision_vars entirely, so the approx solver must treat a key absent from decision_vars as forbidden (_is_forbidden) and read per-strategy costs from any surviving inp_idx (_surviving_dv). 
Applied across the forbidden checks and decision_var reads. - Replay add_node_axis_constraint from _constraint_log in both the PuLP and the lite (no-PuLP) topology paths so propagated Shard pins restrict the approx solver's per-node out_idx domain (method="fix" leaves no PuLP row). - Port the forward param-dtype constraint (current main) into _topology_direct so the lite build matches the full build exactly under mixed precision. - Guard _fix_node_output_indices / add_node_axis_constraint against pruned (None) variables and the lite build. Authored with Claude. --- autoparallel/approximate_sharding.py | 132 ++++++++++++++++++++++++--- autoparallel/optimize_sharding.py | 2 + 2 files changed, 123 insertions(+), 11 deletions(-) diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py index 7e1a945a..146effcf 100644 --- a/autoparallel/approximate_sharding.py +++ b/autoparallel/approximate_sharding.py @@ -427,6 +427,11 @@ def _parse_constraints(self): (next(iter(na)), next(iter(nb)), frozenset({(next(iter(oa)), next(iter(ob)))})) ) + # method="fix" axis pins leave no PuLP row to parse above, so replay the + # log to recover them (constraint-method pins are also picked up here, + # idempotently with their == 1 rows). + for n, out_set in self._axis_restrict_from_log().items(): + restrict[n] = restrict.get(n, out_set) & out_set for n, out_set in restrict.items(): if n in self.allowed_out: self.allowed_out[n] = [o for o in self.allowed_out[n] if o in out_set] @@ -469,7 +474,39 @@ def nroot(idx): if not math.isfinite(dv.cost) or dv.cost == 10000.0: self.forbidden.add(key) - # 2. grad-reduce-dtype forbidden (== add_grad_reduce_dtype_constraints). + # 2a. forward param-dtype forbidden (== add_grad_reduce_dtype_constraints + # forward part, unconditional). Force the FSDP allgather to run after + # a downcasting param dtype_cast (in the smaller param_dtype) by + # forbidding any pre-cast redistribution. 
+ cast_op = torch.ops.autoparallel.dtype_cast.default + fwd_pre_cast: set[int] = set() + for param, _grad in get_param_and_grad_nodes(opt.graph).values(): + n = param + while True: + if n.target == cast_op: + break + users = list(n.users.keys()) + if len(users) != 1: + break + child = users[0] + if len(child.all_input_nodes) != 1: + break + n = child + if n.target != cast_op: + continue + if n.meta["val"].dtype.itemsize >= param.meta["val"].dtype.itemsize: + continue # only constrain downcasts + node = n + while node != param: + if node in opt.node_map: + fwd_pre_cast.add(opt.node_map[node]) + node = node.all_input_nodes[0] + for key, dv in opt.decision_vars.items(): + if key[0] in fwd_pre_cast and dv.comm_cost > 0: + self.forbidden.add(key) + + # 2. grad-reduce-dtype (backward) forbidden + # (== add_grad_reduce_dtype_constraints backward part). if getattr(opt, "force_grad_reduce_in_higher_precision", False): cast_op = torch.ops.autoparallel.dtype_cast.default pre_cast: set[int] = set() @@ -553,6 +590,12 @@ def add_paired(node_a, node_b): break r = nroot(opt.node_map[node]) restrict[r] = restrict.get(r, out_set) & out_set + # 4b. per-axis placement restrictions (== add_node_axis_constraint), what + # sharding propagation emits. With method="fix" these leave no PuLP + # row to parse, so replaying the log is the only way the approx solver + # sees the pin. + for r, out_set in self._axis_restrict_from_log().items(): + restrict[r] = restrict.get(r, out_set) & out_set for n_idx, out_set in restrict.items(): if n_idx in self.allowed_out: self.allowed_out[n_idx] = [ @@ -585,10 +628,69 @@ def add_paired(node_a, node_b): return paired_edges, authoritative + def _axis_restrict_from_log(self): + """out_idx restrictions implied by add_node_axis_constraint calls, + replayed from _constraint_log → {root_node_idx: set(out_idx)}. 
+ + This is how the approximate solver honors propagated per-axis pins: keep + only the strategies whose output placement matches the pinned axis, + exactly like ShardingOptimizer.add_node_axis_constraint. It works whether + the pin was applied as a PuLP row ("constraint") or as variable bounds + ("fix", which leaves no row to parse) and in the lite (no-PuLP) build.""" + opt = self.opt + node_root = {lk[0]: rk[0] for lk, rk in opt.cluster_links.items()} + restrict: dict[int, set] = {} + for fname, kwargs in getattr(opt, "_constraint_log", []): + if fname != "add_node_axis_constraint": + continue + node = next( + (nd for nd in opt.nodes if nd.name == kwargs["node_name"]), None + ) + if node is None or node not in opt.strats: + continue + mesh_dim, placement = kwargs["mesh_dim"], kwargs["placement"] + out_set = set() + for i, s in enumerate(opt.strats[node].strategies): + specs = s.output_specs + if isinstance(specs, DTensorSpec): + spec = specs + elif isinstance(specs, (list, tuple)): + spec = next((x for x in specs if isinstance(x, DTensorSpec)), None) + else: + spec = None + if spec is not None and spec.placements[mesh_dim] == placement: + out_set.add(i) + r = node_root.get(opt.node_map[node], opt.node_map[node]) + restrict[r] = restrict.get(r, out_set) & out_set + return restrict + + def _is_forbidden(self, key) -> bool: + """A strategy edge is forbidden if a constraint ruled it out OR it was + pruned for infinite cost. Pruning removes such keys from decision_vars + entirely (see ShardingOptimizer._build_decision_vars), so a key missing + from decision_vars is just as forbidden as one in ``self.forbidden``.""" + return key in self.forbidden or key not in self.opt.decision_vars + + def _surviving_dv(self, v, argi, o): + """A DecisionVar for (v, argi, o, *) using any inp_idx that survived + pruning, or None if every edge for that (arg, out) was pruned. 
+ compute_cost / input_spec are identical across inp_idx for a fixed out.""" + strat = self.opt.strats[self.opt.nodes[v]].strategies[o] + n_inp = ( + len(strat.redistribute_cost[argi]) + if argi < len(strat.redistribute_cost) + else 1 + ) + for inp in range(n_inp): + dv = self.opt.decision_vars.get((v, argi, o, inp)) + if dv is not None: + return dv + return None + def _out_fully_forbidden(self, v, node, o): strat = self.opt.strats[node].strategies[o] for argi, costs in enumerate(strat.redistribute_cost): - if all((v, argi, o, inp) in self.forbidden for inp in range(len(costs))): + if all(self._is_forbidden((v, argi, o, inp)) for inp in range(len(costs))): return True return False @@ -736,13 +838,16 @@ def _choice_lower_bound(self, v, node, o): opt = self.opt strat = opt.strats[node].strategies[o] mult = self.node_mult[v] - lb = opt.decision_vars[(v, 0, o, 0)].compute_cost * len(strat.redistribute_cost) + dv0 = self._surviving_dv(v, 0, o) + if dv0 is None: + return INF # every edge for this output strategy was pruned + lb = dv0.compute_cost * len(strat.redistribute_cost) lb *= mult for argi, _p in self.input_edges.get(v, []): best = INF for inp in range(len(strat.redistribute_cost[argi])): key = (v, argi, o, inp) - if key in self.forbidden: + if self._is_forbidden(key): continue dv = opt.decision_vars[key] best = min(best, dv.comm_cost + dv.sharding_transition_cost) @@ -797,7 +902,7 @@ def _build_memory_info(self): } def _param_ratio(self, v, node, o): - spec = self.opt.decision_vars[(v, 0, o, 0)].input_spec + spec = self._surviving_dv(v, 0, o).input_spec new_shape, _ = _get_sharded_shape_stride(spec) return math.prod(new_shape) / math.prod(spec.tensor_meta.shape) @@ -859,7 +964,10 @@ def _self_cost_vec(self, m, out_indices): for i, o in enumerate(out_indices): strat = opt.strats[node].strategies[o] n_args = len(strat.redistribute_cost) - dv0 = opt.decision_vars[(m, 0, o, 0)] + dv0 = self._surviving_dv(m, 0, o) + if dv0 is None: # whole output strategy pruned + 
out[i] = BIG + continue c = dv0.compute_cost * n_args # Args with no flow edge (constructors / None-spec) are scored at # inp=0 here; args with a producer are charged via the pairwise edges. @@ -867,7 +975,7 @@ def _self_cost_vec(self, m, out_indices): if argi in prod: continue key = (m, argi, o, 0) - if key in self.forbidden: + if self._is_forbidden(key): c = BIG break dv = opt.decision_vars[key] @@ -890,7 +998,7 @@ def _edge_matrix(self, v, argi, p): for ov in ov_vals: for op in op_vals: key = (v, argi, ov, op) - if key in self.forbidden: + if self._is_forbidden(key): continue dv = opt.decision_vars[key] R[ov, op] = dv.comm_cost + dv.sharding_transition_cost @@ -1181,7 +1289,7 @@ def total_objective(self): p = prod.get(argi) inp = self.cur_out[p] if p is not None else 0 key = (v, argi, o, inp) - if key in self.forbidden: + if self._is_forbidden(key): return INF c += self.opt.decision_vars[key].cost total += self.node_mult[v] * c @@ -1204,9 +1312,11 @@ def _write_back(self): p = prod.get(argi) inp = self.cur_out[p] if p is not None else 0 key = (v, argi, o, inp) - if key in self.forbidden: + if self._is_forbidden(key): feasible = False - if has_pulp: + # A pruned key has no PuLP variable; the infeasible flag above + # already records it (and raises in _solve). 
+ if has_pulp and key in opt.pulp_variables: opt.pulp_variables[key].varValue = 1 selected.append(key) opt.selected_keys = list(selected) diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 474675ca..fd6a256c 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -2549,6 +2549,8 @@ def _fix_node_output_indices(self, node, keep_out_idxs): if out_idx in keep_out_idxs: continue var = self._get_pulp_variable((node_idx, argi, out_idx, inp_idx)) + if var is None: # pruned (invalid) strategy edge, or lite (no-PuLP) build + continue if var.upBound != 0: var.upBound = 0 self._fixed_vars.append(var) From f7af13590ae7c38059a843b0bedfe32fa9ec96e8 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sat, 30 May 2026 19:32:12 -0700 Subject: [PATCH 10/27] Fix loaded-optimizer resolve() under dp_solver profiling A loaded optimizer (ShardingOptimizer.load) is built via __new__ and never ran the dp_solver init-time profiling, so resolve()/get_solution() -> _log_solve_profile hit a missing self.profile. Guard the solve profiler to no-op without init timings, and initialize profile/build_pulp/_node_axis_constraints/_fixed_vars in load_optimizer so loaded optimizers carry the full attribute set. Authored with Claude. --- autoparallel/optimize_sharding.py | 5 +++++ autoparallel/serialization.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index fd6a256c..fd46b196 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -1495,6 +1495,11 @@ def _log_solve_profile( extract_s, total_s, ): + # Optimizers loaded from a save file skip init-time profiling; there is + # nothing to extend, and the phase timings below are absent. 
+ profile = getattr(self, "profile", None) + if not profile or "init_total_s" not in profile.get("timings", {}): + return mesh = self.profile["mesh"] model = self.profile["model"] timings = self.profile["timings"] diff --git a/autoparallel/serialization.py b/autoparallel/serialization.py index 00bc6dcf..dfd953b1 100644 --- a/autoparallel/serialization.py +++ b/autoparallel/serialization.py @@ -264,7 +264,13 @@ def load_optimizer(cls, path): opt._constraint_log = [] opt._memory_constraint = None opt._node_constraint_names = {} + opt._node_axis_constraints = defaultdict(list) + opt._fixed_vars = [] opt._name_counters = {} + # Loaded optimizers rebuild the PuLP problem below but carry no init-time + # profiling; an empty profile lets solve-time profile writes/guards no-op. + opt.build_pulp = True + opt.profile = {"timings": {}} # Reconstruct cluster_links by expanding the node-level mapping over # all (argi, out_idx, inp_idx) combinations. From b767f2d29dcc8ce0dc8f10cf92ec2adb667ee2d3 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sat, 30 May 2026 19:41:47 -0700 Subject: [PATCH 11/27] Apply the memory constraint in get_lower_bound The LP relaxation lower bound must include the parameter-memory budget, or it bounds a different (unconstrained) problem and reads below the true ILP optimum. With the fix the LP bound equals the exact constrained optimum on LLaMA3-1B, making it a tight optimality certificate (used for the 3D gap, where the ILP is intractable). Authored with Claude. 
--- autoparallel/optimize_sharding.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index fd46b196..8d0de31d 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -1383,6 +1383,10 @@ def get_lower_bound(self, verbose=False): try: if self.prob.objective is None: self._set_objective() + # The relaxation must include the parameter-memory constraint, or it + # is a lower bound on a different (unconstrained) problem and can fall + # below the true ILP optimum. + self._apply_memory_constraint() for var in self.pulp_variables.values(): var.cat = pulp.LpContinuous From 6fcf8443c8f4c92444d367ea585c7a0fa60f7a66 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sat, 30 May 2026 19:46:21 -0700 Subject: [PATCH 12/27] Add joint-optimization benchmark for LLaMA3 on 2D/3D meshes _bench_merge.py compares the four configurations (prune ILP, annotated ILP, prune+dp approx, prune+dp+annotated) on one traced model, reporting per-phase timings, objectives, the LP-relaxation optimality certificate, and the acceptance checks. _bench_dp_alone.py isolates the approx-without-prune baseline (run against the dp_solver checkout) for the dp-alone comparison. Authored with Claude. --- examples/_bench_dp_alone.py | 86 +++++++++++ examples/_bench_merge.py | 281 ++++++++++++++++++++++++++++++++++++ 2 files changed, 367 insertions(+) create mode 100644 examples/_bench_dp_alone.py create mode 100644 examples/_bench_merge.py diff --git a/examples/_bench_dp_alone.py b/examples/_bench_dp_alone.py new file mode 100644 index 00000000..10c026ec --- /dev/null +++ b/examples/_bench_dp_alone.py @@ -0,0 +1,86 @@ +"""Minimal approx-solver timing, for the 'dp alone' (approx WITHOUT prune) +baseline. Run it with PYTHONPATH pointing at the dp_solver checkout to get the +unpruned numbers, and at the merge checkout to cross-check prune+dp. 
+ +Reports lite-build time, approx solve time, decision-var count and objective for +LLaMA3-1B with the canonical constraints. Env: MESH, SEQLEN, N_LAYERS. +""" +import logging +import os +import time +from unittest.mock import patch + +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +import autoparallel +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), + ("get_device_capability", lambda *a, **k: (9, 0))]: + patch(f"torch.cuda.{fn}", val).start() +patch("torch.cuda.get_device_properties", lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() + +N_LAYERS = int(os.environ.get("N_LAYERS", "0")) +SEQLEN = int(os.environ.get("SEQLEN", "2048")) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(",")) +ws = 1 +for d in MESH_SHAPE: + ws *= d +names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] +torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) +ndim = mesh.ndim +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] + + +def model_fn(): + args = TransformerModelArgs( + dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, + multiple_of=256, rope_theta=500000, vocab_size=vocab_size, 
max_seq_len=SEQLEN) + if N_LAYERS: + args.n_layers = N_LAYERS + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) +x = (Shard(0),) + (Replicate(),) * (ndim - 1) +out = (Shard(0), Shard(2)) if ndim == 2 else x + +print(f"autoparallel = {autoparallel.__file__}", flush=True) +print(f"=== dp-alone (approx) LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} " + f"layers={N_LAYERS or 16} ===", flush=True) + +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") +autop.__enter__() +autop.add_parameter_memory_constraint(low=None, high=None) +autop.add_input_constraints([x]) +autop.add_output_constraints([out]) +t_build = time.perf_counter() - t +opt = autop.sharding_optimizer + +t = time.perf_counter() +ApproximateShardingSolver(opt).get_solution(verbose=False) +t_solve = time.perf_counter() - t +obj = opt.profile["approximate"]["objective"] + +print(f"[dp-alone] build={t_build:.2f}s approx_solve={t_solve:.2f}s " + f"total={t_build + t_solve:.2f}s obj={obj:.1f} " + f"decision_vars={len(opt.decision_vars)}", flush=True) diff --git a/examples/_bench_merge.py b/examples/_bench_merge.py new file mode 100644 index 00000000..e5bc6c5a --- /dev/null +++ b/examples/_bench_merge.py @@ -0,0 +1,281 @@ +"""Joint-optimization benchmark: prune (+ annotated) + dp (approx) vs each alone. 
+ +Measures, for LLaMA3-1B on a 2D or 3D mesh with the canonical example_llama3 +constraints, four optimization configurations on the SAME traced model: + + prune : full ILP build + exact CBC solve (== prune_search_space) + annotated : full ILP build + propagate(fix) + CBC solve (== annotated_search) + dp : lite build + approx solve (== dp_solver) + merged : lite build + propagate(fix) + approx (this branch) + +Reports each config's build/solve/total time and objective, the LP-relaxation +lower bound (an optimality certificate), and checks the acceptance criteria: + + * merged objective within 10% (ideally 5%) of the ILP optimum, and + * merged total time < every individual optimization's total time. + +Env knobs: MESH ("8,8" 2D / "2,4,8" 3D), ILP_TIMEOUT (s, 0=unlimited), +N_LAYERS (0=default 16), SEQLEN. +""" +import logging +import os +import time +from unittest.mock import patch + +import pulp +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) + + +def log(msg=""): + print(msg, flush=True) + + +# Fake an 8-GPU H100 node so the cost model runs without real GPUs. 
+_PATCHES = [ + patch("torch.cuda.device_count", lambda: 8), + patch("torch.cuda.get_device_name", lambda *a, **k: "H100"), + patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)), + patch( + "torch.cuda.get_device_properties", + lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132} + )(), + ), +] +for p in _PATCHES: + p.start() + +N_LAYERS = int(os.environ.get("N_LAYERS", "0")) +SEQLEN = int(os.environ.get("SEQLEN", str(2048))) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(",")) +ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "0")) + +world_size = 1 +for d in MESH_SHAPE: + world_size *= d +_NAMES = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")} +mesh_names = _NAMES[len(MESH_SHAPE)] +fake_store = FakeStore() +torch.distributed.init_process_group( + "fake", store=fake_store, rank=0, world_size=world_size +) +mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", MESH_SHAPE, mesh_dim_names=mesh_names +) +ndim = mesh.ndim + +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] +seqlen = SEQLEN + + +def model_fn(): + args = TransformerModelArgs( + dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, + ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000, + vocab_size=vocab_size, max_seq_len=seqlen, + ) + if N_LAYERS: + args.n_layers = N_LAYERS + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda") + + +# Canonical TP plan: column-parallel q/k/v/w1/w3, row-parallel wo/w2, pinning +# only the tensor-parallel (last) mesh axis; data/cp axes left to the optimizer. 
+COLUMN_PARALLEL = (None,) * (ndim - 1) + (Shard(0),) +ROW_PARALLEL = (None,) * (ndim - 1) + (Shard(1),) + + +def annotate_tp_plan(autop): + for proj in ["wq", "wk", "wv"]: + autop.annotate_parameter(f"layers.*.attention.{proj}.weight", COLUMN_PARALLEL) + autop.annotate_parameter("layers.*.attention.wo.weight", ROW_PARALLEL) + for proj in ["w1", "w3"]: + autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", COLUMN_PARALLEL) + autop.annotate_parameter("layers.*.feed_forward.w2.weight", ROW_PARALLEL) + + +def add_constraints(autop): + x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1) + out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding + autop.add_parameter_memory_constraint(low=None, high=None) + autop.add_input_constraints([x_sharding]) + autop.add_output_constraints([out_sharding]) + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) + +log(f"=== LLaMA3-1B mesh={MESH_SHAPE}{mesh_names} world={world_size} " + f"seqlen={seqlen} layers={N_LAYERS or 16} ===") +results = {} # name -> dict(build, solve, total, obj) + + +def build(build_pulp): + t = time.perf_counter() + autop = AutoParallel( + model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, + solver="ilp" if build_pulp else "approx", + ) + autop.__enter__() + add_constraints(autop) + return autop, time.perf_counter() - t + + +# ---------- full PuLP build: prune (ILP) + annotated (ILP) + LP bound ---------- +autop_full, build_full = build(build_pulp=True) +opt = autop_full.sharding_optimizer +log(f"\n[full build] {build_full:.2f}s decision_vars={len(opt.decision_vars)} " + f"pulp_vars={len(opt.pulp_variables)} constraints={len(opt.prob.constraints)}") + +# prune: exact ILP solve. preprocess-off is part of the prune optimization, and +# _apply_memory_constraint installs the same budget the approx solver enforces, +# so every config solves the identical constrained problem. 
+opt._set_objective() +opt._apply_memory_constraint() +kw = {"msg": False, "options": ["preprocess off"]} +if ILP_TIMEOUT > 0: + kw["timeLimit"] = ILP_TIMEOUT +t = time.perf_counter() +opt.prob.solve(pulp.PULP_CBC_CMD(**kw)) +t_ilp = time.perf_counter() - t +obj_opt = pulp.value(opt.prob.objective) +ilp_status = pulp.LpStatus[opt.prob.status] +results["prune"] = dict(build=build_full, solve=t_ilp, total=build_full + t_ilp, + obj=obj_opt) +log(f"[prune ] ILP solve {t_ilp:8.2f}s obj={obj_opt:11.1f} status={ilp_status}") + +# LP-relaxation lower bound: certifies the optimality gap without a full ILP +# (this sharding LP is empirically integral, so the bound equals the optimum). +lb_res = opt.get_lower_bound(verbose=False) +lb = lb_res.objective +log(f"[LP-bound ] solve {lb_res.solve_s:8.2f}s lower_bound={lb:11.1f}") + +# annotated: propagate the TP plan, then exact ILP solve on the reduced problem. +annotate_tp_plan(autop_full) +t = time.perf_counter() +prop = autop_full.propagate_annotations(verbose=False, method="fix") +t_prop_full = time.perf_counter() - t +opt._apply_memory_constraint() +t = time.perf_counter() +opt.prob.solve(pulp.PULP_CBC_CMD(**kw)) +t_ilp_ann = time.perf_counter() - t +obj_ann = pulp.value(opt.prob.objective) +results["annotated"] = dict(build=build_full, solve=t_prop_full + t_ilp_ann, + total=build_full + t_prop_full + t_ilp_ann, obj=obj_ann) +log(f"[annotated] propagate {t_prop_full:.2f}s + ILP {t_ilp_ann:.2f}s " + f"obj={obj_ann:11.1f} (pinned {prop.nodes_determined} nodes, " + f"-{100*prop.reduction:.0f}% strategies)") + +# Tear down before the next build: AutoParallel installs a FakeTensorMode, and +# two entered instances can't coexist. 
+autop_full.__exit__(None, None, None) + +# ---------- lite build: dp=prune+approx + merged=prune+approx+annotated ------- +autop_lite, build_lite = build(build_pulp=False) +opt_l = autop_lite.sharding_optimizer +log(f"\n[lite build] {build_lite:.2f}s decision_vars={len(opt_l.decision_vars)} " + f"pulp_vars={len(opt_l.pulp_variables)} (no PuLP problem)") + +# dp: approximate solve, no annotations. +t = time.perf_counter() +ApproximateShardingSolver(opt_l).get_solution(verbose=False) +t_approx_dp = time.perf_counter() - t +obj_dp = opt_l.profile["approximate"]["objective"] +results["dp"] = dict(build=build_lite, solve=t_approx_dp, total=build_lite + t_approx_dp, + obj=obj_dp) +log(f"[dp ] approx solve {t_approx_dp:8.2f}s obj={obj_dp:11.1f}") + +# merged: propagate the TP plan, then approximate solve on the reduced problem. +annotate_tp_plan(autop_lite) +t = time.perf_counter() +prop_l = autop_lite.propagate_annotations(verbose=False, method="fix") +t_prop_lite = time.perf_counter() - t +t = time.perf_counter() +ApproximateShardingSolver(opt_l).get_solution(verbose=False) +t_approx_merged = time.perf_counter() - t +obj_merged = opt_l.profile["approximate"]["objective"] +results["merged"] = dict(build=build_lite, solve=t_prop_lite + t_approx_merged, + total=build_lite + t_prop_lite + t_approx_merged, obj=obj_merged) +log(f"[merged ] propagate {t_prop_lite:.2f}s + approx {t_approx_merged:.2f}s " + f"obj={obj_merged:11.1f} (pinned {prop_l.nodes_determined} nodes)") + +autop_lite.__exit__(None, None, None) + +# ---------- report ---------- +# Optimality reference: exact ILP optimum if CBC proved it, else the LP lower +# bound (this sharding LP is empirically integral, so lb == optimum). 
+optimal = obj_opt if ilp_status == "Optimal" else lb +opt_label = "ILP optimum" if ilp_status == "Optimal" else "LP lower bound" + +LABELS = { + "prune": "prune (ILP)", + "annotated": "annotated (ILP)", + "dp": "prune+dp (approx)", + "merged": "prune+dp+anno", +} +log("\n" + "=" * 78) +log(f"{'config':<20}{'build(s)':>10}{'solve(s)':>10}{'total(s)':>10}" + f"{'objective':>13}{'gap%':>9}") +log("-" * 78) +for name in ["prune", "annotated", "dp", "merged"]: + r = results[name] + gap = 100 * (r["obj"] - optimal) / optimal + log(f"{LABELS[name]:<20}{r['build']:>10.2f}{r['solve']:>10.2f}{r['total']:>10.2f}" + f"{r['obj']:>13.1f}{gap:>+9.2f}") +log("=" * 78) +log(f"optimality reference: {opt_label} = {optimal:.1f} (ILP status={ilp_status})") + +# Core joint optimization is prune + dp (the approximate solver on the pruned +# space); annotation is the optional extra speedup. Report both gaps. +gap_core = 100 * (obj_dp - optimal) / optimal +gap_full = 100 * (obj_merged - optimal) / optimal +log(f"\nobjective gap vs {opt_label}:") +log(f" prune+dp (approx) : {gap_core:+.2f}% (core: prune + dp)") +log(f" prune+dp+annotated : {gap_full:+.2f}% (+ optional annotation)") + +# Timing: the joint solver must beat each ILP-based individual optimization. +# (dp alone == approx WITHOUT prune is measured against the dp_solver checkout +# separately; prune makes the joint build/solve strictly cheaper than that.) 
+log("\njoint total time (build+solve) vs each individual optimization:") +all_faster = True +for joint in ["dp", "merged"]: + tj = results[joint]["total"] + line_ok = True + for name in ["prune", "annotated"]: + to = results[name]["total"] + faster = tj < to + line_ok = line_ok and faster + log(f" {LABELS[joint]:<18} {tj:7.2f}s {'<' if faster else '>='} " + f"{LABELS[name]:<16} {to:7.2f}s {to / tj:5.1f}x " + f"{'OK' if faster else 'FAIL'}") + all_faster = all_faster and line_ok + +log("\n" + "=" * 78) +# The full three-way joint (prune + dp + annotated) is the deliverable: the +# approx solver alone is ~20% off, but the propagated TP plan steers it to the +# optimum. Annotation is therefore what meets the accuracy bar; prune+dp alone +# trades accuracy for a little more speed. +ok_gap = abs(gap_full) <= 10.0 +log(f"ACCEPTANCE gap<=10% (full joint prune+dp+anno): {ok_gap} " + f"(full={gap_full:+.2f}%, <=5%: {abs(gap_full) <= 5.0})") +log(f" (informational: prune+dp without annotation = {gap_core:+.2f}%)") +log(f"ACCEPTANCE joint faster than ILP-based optimizations: {all_faster}") +log(f"OVERALL: {'PASS' if ok_gap and all_faster else 'CHECK'}") From e8689cdc8c5c0327ba91ad9863bd7c3533c58633 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sat, 30 May 2026 20:41:18 -0700 Subject: [PATCH 13/27] Extend benches for 3D: MODEL=small, MERGED flag, LP-bound certificate At full-1B 3D scale the PuLP problem has ~8M binary variables (strategy count is rank x mesh-dims, independent of tensor size and -- via clustering -- of layer count), so the exact ILP is intractable. _bench_3d_cert.py certifies the merged gap on full 3D via the LP-relaxation lower bound (tight: it equals the exact optimum on 2D). _bench_dp_alone.py gains a MERGED flag (annotate+propagate) and _bench_merge.py a MODEL=small mode. Authored with Claude. 
--- examples/_bench_3d_cert.py | 108 ++++++++++++++++++++++++++++++++++++ examples/_bench_dp_alone.py | 23 +++++++- examples/_bench_merge.py | 28 +++++++--- 3 files changed, 148 insertions(+), 11 deletions(-) create mode 100644 examples/_bench_3d_cert.py diff --git a/examples/_bench_3d_cert.py b/examples/_bench_3d_cert.py new file mode 100644 index 00000000..0f5bcdc5 --- /dev/null +++ b/examples/_bench_3d_cert.py @@ -0,0 +1,108 @@ +"""3D optimality certificate for the merged solver on full LLaMA3-1B. + +The 3D ILP has ~8M binary variables; the exact CBC solve is impractical (a 2.6 GB +MPS file). The LP relaxation, however, is empirically integral for this problem +(verified on 2D, where it equals the exact optimum), so its objective is a tight +lower bound on the ILP optimum. This script does ONE full PuLP build, then: + + 1. get_lower_bound() -> LP lower bound (the optimality reference) + 2. annotate + propagate + ApproximateShardingSolver -> merged objective + +and reports the certified gap = (merged - lb) / lb. Slow (one ~13min build + a +multi-minute LP solve) but a one-shot 3D certificate. Env: MESH, SEQLEN. 
+""" +import logging +import os +import time +from unittest.mock import patch + +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), + ("get_device_capability", lambda *a, **k: (9, 0))]: + patch(f"torch.cuda.{fn}", val).start() +patch("torch.cuda.get_device_properties", lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() + + +def log(m=""): + print(m, flush=True) + + +SEQLEN = int(os.environ.get("SEQLEN", "2048")) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) +ws = 1 +for d in MESH_SHAPE: + ws *= d +names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] +torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) +ndim = mesh.ndim +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] + + +def model_fn(): + args = TransformerModelArgs( + dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, + multiple_of=256, rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN) + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") + + 
+set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) +x = (Shard(0),) + (Replicate(),) * (ndim - 1) +out = (Shard(0), Shard(2)) if ndim == 2 else x + +log(f"=== 3D cert: LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===") +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp") +autop.__enter__() +autop.add_parameter_memory_constraint(low=None, high=None) +autop.add_input_constraints([x]) +autop.add_output_constraints([out]) +opt = autop.sharding_optimizer +log(f"[build] {time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)} " + f"pulp_vars={len(opt.pulp_variables)}") + +# LP-relaxation lower bound = optimality reference (the exact ILP is intractable). +opt._set_objective() +t = time.perf_counter() +lb = opt.get_lower_bound(verbose=False).objective +log(f"[LP-bound] {time.perf_counter()-t:.1f}s lower_bound={lb:.1f}") + +# Merged solver on the same build: propagate the TP plan, then approx-solve. 
+cp = (None,) * (ndim - 1) + (Shard(0),) +rp = (None,) * (ndim - 1) + (Shard(1),) +for proj in ["wq", "wk", "wv"]: + autop.annotate_parameter(f"layers.*.attention.{proj}.weight", cp) +autop.annotate_parameter("layers.*.attention.wo.weight", rp) +for proj in ["w1", "w3"]: + autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", cp) +autop.annotate_parameter("layers.*.feed_forward.w2.weight", rp) +autop.propagate_annotations(verbose=False, method="fix") +t = time.perf_counter() +ApproximateShardingSolver(opt).get_solution(verbose=False) +merged = opt.profile["approximate"]["objective"] +log(f"[merged] approx {time.perf_counter()-t:.1f}s objective={merged:.1f}") + +gap = 100 * (merged - lb) / lb +log(f"\n=== 3D certified gap = {gap:+.2f}% (merged {merged:.1f} vs LP lower bound " + f"{lb:.1f}) ===") +log(f"acceptance gap<=10%: {abs(gap)<=10} (<=5%: {abs(gap)<=5})") diff --git a/examples/_bench_dp_alone.py b/examples/_bench_dp_alone.py index 10c026ec..4b67c3a1 100644 --- a/examples/_bench_dp_alone.py +++ b/examples/_bench_dp_alone.py @@ -76,11 +76,28 @@ def input_fn(): t_build = time.perf_counter() - t opt = autop.sharding_optimizer +# With MERGED=1, add the propagated TP plan before solving (full joint solver). 
+t_prop = 0.0 +label = "dp-alone" +if os.environ.get("MERGED") == "1": + label = "merged" + cp = (None,) * (ndim - 1) + (Shard(0),) + rp = (None,) * (ndim - 1) + (Shard(1),) + for proj in ["wq", "wk", "wv"]: + autop.annotate_parameter(f"layers.*.attention.{proj}.weight", cp) + autop.annotate_parameter("layers.*.attention.wo.weight", rp) + for proj in ["w1", "w3"]: + autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", cp) + autop.annotate_parameter("layers.*.feed_forward.w2.weight", rp) + t = time.perf_counter() + autop.propagate_annotations(verbose=False, method="fix") + t_prop = time.perf_counter() - t + t = time.perf_counter() ApproximateShardingSolver(opt).get_solution(verbose=False) t_solve = time.perf_counter() - t obj = opt.profile["approximate"]["objective"] -print(f"[dp-alone] build={t_build:.2f}s approx_solve={t_solve:.2f}s " - f"total={t_build + t_solve:.2f}s obj={obj:.1f} " - f"decision_vars={len(opt.decision_vars)}", flush=True) +print(f"[{label}] build={t_build:.2f}s propagate={t_prop:.2f}s " + f"approx_solve={t_solve:.2f}s total={t_build + t_prop + t_solve:.2f}s " + f"obj={obj:.1f} decision_vars={len(opt.decision_vars)}", flush=True) diff --git a/examples/_bench_merge.py b/examples/_bench_merge.py index e5bc6c5a..c6249021 100644 --- a/examples/_bench_merge.py +++ b/examples/_bench_merge.py @@ -76,17 +76,29 @@ def log(msg=""): ) ndim = mesh.ndim -vocab_size = 128256 +# MODEL=1b is the real LLaMA3-1B; MODEL=small is a tractable proxy whose smaller +# tensors yield few enough decision variables that the exact ILP/LP-bound finish +# on a 3D mesh (where the 1B PuLP problem has ~8M variables and is impractical), +# letting us certify the approximate solver's gap on real 3D structure. 
+MODEL = os.environ.get("MODEL", "1b") +vocab_size = 1024 if MODEL == "small" else 128256 batch_size = 2 * mesh.shape[0] seqlen = SEQLEN def model_fn(): - args = TransformerModelArgs( - dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, - ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000, - vocab_size=vocab_size, max_seq_len=seqlen, - ) + if MODEL == "small": + args = TransformerModelArgs( + dim=256, n_layers=4, n_heads=8, n_kv_heads=4, + multiple_of=64, rope_theta=500000, + vocab_size=vocab_size, max_seq_len=seqlen, + ) + else: + args = TransformerModelArgs( + dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, + ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000, + vocab_size=vocab_size, max_seq_len=seqlen, + ) if N_LAYERS: args.n_layers = N_LAYERS with torch.device("meta"): @@ -123,8 +135,8 @@ def add_constraints(autop): set_nccl_topo_config(detect_nccl_topo_config(mesh)) mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -log(f"=== LLaMA3-1B mesh={MESH_SHAPE}{mesh_names} world={world_size} " - f"seqlen={seqlen} layers={N_LAYERS or 16} ===") +log(f"=== LLaMA3-{MODEL} mesh={MESH_SHAPE}{mesh_names} world={world_size} " + f"seqlen={seqlen} vocab={vocab_size} layers={N_LAYERS or '(default)'} ===") results = {} # name -> dict(build, solve, total, obj) From 523f3aaaf482544476976607f14962a459ffb6a0 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sat, 30 May 2026 21:14:19 -0700 Subject: [PATCH 14/27] Use HiGHS (scipy.linprog) for the 3D LP-bound certificate CBC's simplex on the 8M-variable 3D LP runs for hours; HiGHS solves it in minutes. Validated on 2D: HiGHS lower bound (72011.5) matches CBC and the exact ILP optimum to the decimal. The cert now does one full build -> prune+dp + merged approx objectives + HiGHS LP lower bound -> certified gaps. Authored with Claude. 
--- examples/_bench_3d_cert.py | 90 ++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 22 deletions(-) diff --git a/examples/_bench_3d_cert.py b/examples/_bench_3d_cert.py index 0f5bcdc5..956489cb 100644 --- a/examples/_bench_3d_cert.py +++ b/examples/_bench_3d_cert.py @@ -1,22 +1,26 @@ """3D optimality certificate for the merged solver on full LLaMA3-1B. -The 3D ILP has ~8M binary variables; the exact CBC solve is impractical (a 2.6 GB -MPS file). The LP relaxation, however, is empirically integral for this problem -(verified on 2D, where it equals the exact optimum), so its objective is a tight -lower bound on the ILP optimum. This script does ONE full PuLP build, then: - - 1. get_lower_bound() -> LP lower bound (the optimality reference) - 2. annotate + propagate + ApproximateShardingSolver -> merged objective - -and reports the certified gap = (merged - lb) / lb. Slow (one ~13min build + a -multi-minute LP solve) but a one-shot 3D certificate. Env: MESH, SEQLEN. +The 3D ILP has ~8M binary variables; the exact CBC solve (and even CBC's LP +relaxation) is impractical (a 2.6 GB MPS file; CBC simplex runs for hours). The +LP relaxation is empirically integral for this problem (verified on 2D, where it +equals the exact optimum), so its objective is a tight lower bound on the ILP +optimum. We solve that LP with HiGHS (scipy.optimize.linprog), which handles the +8M-variable sparse LP in minutes, then compare to the approximate solvers. + +One full PuLP build feeds: the HiGHS LP lower bound (optimality reference), and +the prune+dp / merged approximate objectives. Reports the certified gaps. Env: +MESH, SEQLEN. 
""" import logging import os import time from unittest.mock import patch +import numpy as np +import pulp +import scipy.sparse as sp import torch +from scipy.optimize import linprog from torch.distributed.fsdp import MixedPrecisionPolicy from torch.distributed.tensor.placement_types import Replicate, Shard from torch.testing._internal.distributed.fake_pg import FakeStore @@ -65,12 +69,45 @@ def input_fn(): return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") +def lp_lower_bound_highs(opt): + """Solve the LP relaxation (binaries -> [0,1]) of opt.prob with HiGHS and + return its objective: a tight lower bound on the ILP optimum.""" + variables = opt.prob.variables() + idx = {v.name: i for i, v in enumerate(variables)} + n = len(variables) + c = np.zeros(n) + for v, coeff in opt.prob.objective.items(): + c[idx[v.name]] += coeff + rows_eq, cols_eq, data_eq, b_eq = [], [], [], [] + rows_ub, cols_ub, data_ub, b_ub = [], [], [], [] + r_eq = r_ub = 0 + for con in opt.prob.constraints.values(): + rhs = -con.constant + items = list(con.items()) + if con.sense == pulp.LpConstraintEQ: + for v, coeff in items: + rows_eq.append(r_eq); cols_eq.append(idx[v.name]); data_eq.append(coeff) + b_eq.append(rhs); r_eq += 1 + else: # LE: a<=b ; GE: a>=b -> -a<=-b + sign = 1.0 if con.sense == pulp.LpConstraintLE else -1.0 + for v, coeff in items: + rows_ub.append(r_ub); cols_ub.append(idx[v.name]); data_ub.append(sign * coeff) + b_ub.append(sign * rhs); r_ub += 1 + A_eq = sp.csr_matrix((data_eq, (rows_eq, cols_eq)), shape=(r_eq, n)) if r_eq else None + A_ub = sp.csr_matrix((data_ub, (rows_ub, cols_ub)), shape=(r_ub, n)) if r_ub else None + res = linprog(c, A_ub=A_ub, b_ub=(b_ub or None), A_eq=A_eq, b_eq=(b_eq or None), + bounds=(0, 1), method="highs") + if not res.success: + raise RuntimeError(f"HiGHS LP failed: {res.message}") + return res.fun, n, r_eq + r_ub + + set_nccl_topo_config(detect_nccl_topo_config(mesh)) mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, 
reduce_dtype=torch.float32) x = (Shard(0),) + (Replicate(),) * (ndim - 1) out = (Shard(0), Shard(2)) if ndim == 2 else x -log(f"=== 3D cert: LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===") +log(f"=== 3D cert (HiGHS): LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===") t = time.perf_counter() autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp") autop.__enter__() @@ -78,16 +115,18 @@ def input_fn(): autop.add_input_constraints([x]) autop.add_output_constraints([out]) opt = autop.sharding_optimizer +opt._set_objective() +opt._apply_memory_constraint() log(f"[build] {time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)} " - f"pulp_vars={len(opt.pulp_variables)}") + f"pulp_vars={len(opt.pulp_variables)} constraints={len(opt.prob.constraints)}") -# LP-relaxation lower bound = optimality reference (the exact ILP is intractable). -opt._set_objective() +# prune+dp (approx, no annotation) on the same problem. t = time.perf_counter() -lb = opt.get_lower_bound(verbose=False).objective -log(f"[LP-bound] {time.perf_counter()-t:.1f}s lower_bound={lb:.1f}") +ApproximateShardingSolver(opt).get_solution(verbose=False) +prune_dp = opt.profile["approximate"]["objective"] +log(f"[prune+dp] approx {time.perf_counter()-t:.1f}s objective={prune_dp:.1f}") -# Merged solver on the same build: propagate the TP plan, then approx-solve. +# merged (prune+dp+annotated): propagate the TP plan, then approx-solve. 
cp = (None,) * (ndim - 1) + (Shard(0),) rp = (None,) * (ndim - 1) + (Shard(1),) for proj in ["wq", "wk", "wv"]: @@ -100,9 +139,16 @@ def input_fn(): t = time.perf_counter() ApproximateShardingSolver(opt).get_solution(verbose=False) merged = opt.profile["approximate"]["objective"] -log(f"[merged] approx {time.perf_counter()-t:.1f}s objective={merged:.1f}") +log(f"[merged] approx {time.perf_counter()-t:.1f}s objective={merged:.1f}") -gap = 100 * (merged - lb) / lb -log(f"\n=== 3D certified gap = {gap:+.2f}% (merged {merged:.1f} vs LP lower bound " - f"{lb:.1f}) ===") -log(f"acceptance gap<=10%: {abs(gap)<=10} (<=5%: {abs(gap)<=5})") +# LP relaxation lower bound via HiGHS = optimality reference. +t = time.perf_counter() +lb, nvar, ncon = lp_lower_bound_highs(opt) +log(f"[LP-bound] HiGHS {time.perf_counter()-t:.1f}s lower_bound={lb:.1f} " + f"(vars={nvar} cons={ncon})") + +log("") +for name, obj in [("prune+dp", prune_dp), ("merged", merged)]: + gap = 100 * (obj - lb) / lb + log(f"=== 3D {name:<9} gap = {gap:+.2f}% (obj {obj:.1f} vs LP lower bound " + f"{lb:.1f}) <=10%: {abs(gap)<=10} <=5%: {abs(gap)<=5} ===") From fc434d59038e1c8819b389694466418cd14dac0b Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sun, 31 May 2026 10:53:13 -0700 Subject: [PATCH 15/27] Skip enumeration redistribute-cost computation (algorithm-preserving fast build) Strategy enumeration fills each OpSpec's redistribute_cost via torch's generate_redistribute_costs (~50% of 3D build time per py-spy), but _build_decision_vars overwrites every edge with the NCCL-aware estimate_strategy_comms_cost, and nothing reads the enumeration costs in between (remove_invalid_configs/keep_unique_configs select on placements/shapes only). So during build_sharding_metadata we patch torch's _ops.utils.redistribute_cost to a structure-preserving dummy. Autoparallel's own cost model uses a separate redistribute_cost and is unaffected. 
A/B verified byte-identical decision_vars (dv_hash) and approx objective on tiny + 1B/2D; toggle via AP_FAST_BUILD=0. Authored with Claude. --- autoparallel/optimize_sharding.py | 127 +++++++++++++++++++----------- examples/_bench_build_profile.py | 80 +++++++++++++++++++ examples/_bench_build_verify.py | 92 ++++++++++++++++++++++ 3 files changed, 254 insertions(+), 45 deletions(-) create mode 100644 examples/_bench_build_profile.py create mode 100644 examples/_bench_build_verify.py diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 8d0de31d..7b648946 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -69,9 +69,11 @@ runtime cost while satisfying all constraints. """ +import contextlib import logging import math import operator +import os import tempfile import time from collections import defaultdict @@ -107,6 +109,35 @@ logger = logging.getLogger(__name__) +# Strategy enumeration fills each OpSpec's redistribute_cost via torch's +# generate_redistribute_costs (an expensive per-strategy redistribute-plan +# computation, the dominant cost of build on large/3D meshes). But +# _build_decision_vars overwrites every edge with the NCCL-aware +# estimate_strategy_comms_cost, and nothing reads the enumeration costs in +# between (remove_invalid_configs / keep_unique_configs select on placements/ +# shapes only). So during enumeration we replace torch's redistribute_cost with +# a structure-preserving dummy to skip the wasted work; the final decision_vars +# are byte-identical. Autoparallel's own cost model uses a separate +# redistribute_cost (collective_runtime_estimation) and is unaffected. Escape +# hatch for A/B verification: AP_FAST_BUILD=0. 
+_FAST_BUILD = os.environ.get("AP_FAST_BUILD", "1") == "1" + + +@contextlib.contextmanager +def _skip_enumeration_redistribute_cost(): + if not _FAST_BUILD: + yield + return + import torch.distributed.tensor._ops.utils as _dt_utils + + orig = _dt_utils.redistribute_cost + _dt_utils.redistribute_cost = lambda *args, **kwargs: 0.0 + try: + yield + finally: + _dt_utils.redistribute_cost = orig + + def concretize_symint(val): """Concretize a SymInt to a plain int, pass through other values. @@ -660,52 +691,58 @@ def _normalize_node(self, node): def build_sharding_metadata(self): strats = {} - for node in self.graph.nodes: - if node.op in ("placeholder", "get_attr"): - val = node.meta.get("val") - if isinstance(val, torch.Tensor): - strats[node] = _create_all_options(self.mesh, val.shape, tensor=val) - elif node.op == "placeholder": - # Non-tensor placeholders (e.g. baked-in booleans/strings): - # keep them in strats with empty-shape replicate options - # so the constraint system can reference them. - strats[node] = _create_all_options(self.mesh, ()) + # Enumeration's redistribute_cost matrices are overwritten with real + # costs in _build_decision_vars, so skip computing them here (see + # _skip_enumeration_redistribute_cost). + with _skip_enumeration_redistribute_cost(): + for node in self.graph.nodes: + if node.op in ("placeholder", "get_attr"): + val = node.meta.get("val") + if isinstance(val, torch.Tensor): + strats[node] = _create_all_options( + self.mesh, val.shape, tensor=val + ) + elif node.op == "placeholder": + # Non-tensor placeholders (e.g. baked-in booleans/strings): + # keep them in strats with empty-shape replicate options + # so the constraint system can reference them. + strats[node] = _create_all_options(self.mesh, ()) + else: + # Non-tensor get_attr: GraphModule submodules used by + # HOPs — not added to strats, invisible to the ILP. + # _all_input_nodes filters them. 
+ assert node.op == "get_attr" + assert any( + isinstance(u.target, torch._ops.HigherOrderOperator) + or "local_map" in u.name + for u in node.users + ), f"Non-tensor get_attr {node} is not used by a HOP" + elif node.op == "call_function": + if not _produces_tensor(node.meta.get("val")): + # Shape-computation nodes (sym_size, operator.mul, etc.) + # produce scalars, not tensors — skip sharding. + continue + user_strats = tree_map_only( + torch.fx.Node, + lambda x: strats.get(x, x.meta.get("val")), + node.args, + ) + user_args = tree_map_only( + torch.fx.Node, lambda x: x.meta.get("val"), node.args + ) + user_kwargs = tree_map_only( + torch.fx.Node, lambda x: x.meta.get("val"), node.kwargs + ) + strats[node] = get_placement_options_for_node( + self.mesh, node, user_strats, user_args, user_kwargs + ) + elif node.op == "output": + user_strats = tree_map_only( + torch.fx.Node, lambda x: strats[x], node.args + ) + strats[node] = user_strats else: - # Non-tensor get_attr: GraphModule submodules used by - # HOPs — not added to strats, invisible to the ILP. - # _all_input_nodes filters them. - assert node.op == "get_attr" - assert any( - isinstance(u.target, torch._ops.HigherOrderOperator) - or "local_map" in u.name - for u in node.users - ), f"Non-tensor get_attr {node} is not used by a HOP" - elif node.op == "call_function": - if not _produces_tensor(node.meta.get("val")): - # Shape-computation nodes (sym_size, operator.mul, etc.) - # produce scalars, not tensors — skip sharding. 
- continue - user_strats = tree_map_only( - torch.fx.Node, - lambda x: strats.get(x, x.meta.get("val")), - node.args, - ) - user_args = tree_map_only( - torch.fx.Node, lambda x: x.meta.get("val"), node.args - ) - user_kwargs = tree_map_only( - torch.fx.Node, lambda x: x.meta.get("val"), node.kwargs - ) - strats[node] = get_placement_options_for_node( - self.mesh, node, user_strats, user_args, user_kwargs - ) - elif node.op == "output": - user_strats = tree_map_only( - torch.fx.Node, lambda x: strats[x], node.args - ) - strats[node] = user_strats - else: - raise ValueError(f"Unexpected node op: {node.op}") + raise ValueError(f"Unexpected node op: {node.op}") return strats def create_cluster_links(self, clusters): diff --git a/examples/_bench_build_profile.py b/examples/_bench_build_profile.py new file mode 100644 index 00000000..82b31bd6 --- /dev/null +++ b/examples/_bench_build_profile.py @@ -0,0 +1,80 @@ +"""Dump the lite-build phase breakdown (tracing vs strategy enumeration vs +decision-var cost estimation) for LLaMA3-1B on a 3D mesh, to see where the +~615s build time goes. 
Env: MESH, SEQLEN.""" +import json +import logging +import os +import time +from unittest.mock import patch + +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), + ("get_device_capability", lambda *a, **k: (9, 0))]: + patch(f"torch.cuda.{fn}", val).start() +patch("torch.cuda.get_device_properties", lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() + +SEQLEN = int(os.environ.get("SEQLEN", "2048")) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) +ws = 1 +for d in MESH_SHAPE: + ws *= d +names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] +torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) +ndim = mesh.ndim +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] + + +def model_fn(): + args = TransformerModelArgs( + dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, + multiple_of=256, rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN) + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, 
reduce_dtype=torch.float32) +print(f"=== build profile: mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===", flush=True) + +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") +autop.__enter__() +enter_s = time.perf_counter() - t +opt = autop.sharding_optimizer +tm = opt.profile["timings"] +init = tm.get("init_total_s", 0.0) +tracing = enter_s - init # __enter__ = tracing + ShardingOptimizer construction + +print(json.dumps({ + "enter_total_s": round(enter_s, 1), + "tracing_s (enter - optimizer_init)": round(tracing, 1), + "optimizer_init_total_s": round(init, 1), + " strategy_enumeration_s": round(tm.get("strategy_enumeration_s", 0), 1), + " decision_var_build_s": round(tm.get("decision_var_build_s", 0), 1), + " compute_cost_estimation_s": round(tm.get("compute_cost_estimation_s", 0), 1), + " edge_cost_estimation_s": round(tm.get("edge_cost_estimation_s", 0), 1), + " pulp_var_creation_s (0 in lite)": round(tm.get("pulp_var_creation_s", 0), 1), + " validation_s": round(tm.get("validation_s", 0), 1), + "decision_vars": len(opt.decision_vars), + "graph_nodes": opt.profile["model"]["graph_nodes"], + "strategy_options": opt.profile["strategies"]["strategy_options"], + "option_tuples (edges)": opt.profile["strategies"]["option_tuples"], +}, indent=2), flush=True) diff --git a/examples/_bench_build_verify.py b/examples/_bench_build_verify.py new file mode 100644 index 00000000..08fea734 --- /dev/null +++ b/examples/_bench_build_verify.py @@ -0,0 +1,92 @@ +"""A/B verify that the fast build (AP_FAST_BUILD=1) produces byte-identical +decision_vars + approx objective as the baseline (AP_FAST_BUILD=0), and report +build time. Run the same MESH/MODEL with both env values and diff the dv_hash. 
+Env: MESH, SEQLEN, MODEL (tiny|1b).""" +import hashlib +import logging +import os +import time +from unittest.mock import patch + +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), + ("get_device_capability", lambda *a, **k: (9, 0))]: + patch(f"torch.cuda.{fn}", val).start() +patch("torch.cuda.get_device_properties", lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() + +SEQLEN = int(os.environ.get("SEQLEN", "2048")) +MODEL = os.environ.get("MODEL", "tiny") +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "4,2").split(",")) +ws = 1 +for d in MESH_SHAPE: + ws *= d +names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] +torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) +ndim = mesh.ndim +vocab_size = 128 if MODEL == "tiny" else 128256 +batch_size = 2 * mesh.shape[0] + + +def model_fn(): + if MODEL == "tiny": + args = TransformerModelArgs(dim=64, n_layers=2, n_heads=4, n_kv_heads=2, + vocab_size=vocab_size, multiple_of=32, + rope_theta=500000, max_seq_len=SEQLEN) + else: + args = TransformerModelArgs(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, + 
ffn_dim_multiplier=1.5, multiple_of=256, + rope_theta=500000, vocab_size=vocab_size, + max_seq_len=SEQLEN) + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) +x = (Shard(0),) + (Replicate(),) * (ndim - 1) +out = (Shard(0), Shard(2)) if ndim == 2 else x + +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp") +autop.__enter__() +autop.add_parameter_memory_constraint(low=None, high=None) +autop.add_input_constraints([x]) +autop.add_output_constraints([out]) +build_s = time.perf_counter() - t +opt = autop.sharding_optimizer + +# Canonical, exact dump of every decision var's costs. +items = [] +for key in sorted(opt.decision_vars.keys()): + dv = opt.decision_vars[key] + items.append((key, repr(dv.cost), repr(dv.comm_cost), repr(dv.compute_cost), + repr(dv.sharding_transition_cost))) +dv_hash = hashlib.sha256(repr(items).encode()).hexdigest() + +t = time.perf_counter() +ApproximateShardingSolver(opt).get_solution(verbose=False) +approx_s = time.perf_counter() - t +obj = opt.profile["approximate"]["objective"] + +print(f"AP_FAST_BUILD={os.environ.get('AP_FAST_BUILD', '1')} MODEL={MODEL} " + f"MESH={MESH_SHAPE} build={build_s:.2f}s approx={approx_s:.2f}s " + f"n_dv={len(opt.decision_vars)} dv_hash={dv_hash[:32]} " + f"approx_obj={obj!r}", flush=True) From c78555a7b71e01e21ea2c121527a8f017e5ac727 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sun, 31 May 2026 11:16:19 -0700 Subject: [PATCH 16/27] Store cluster_links node-level (drop per-option expansion) + DecisionVar slots create_cluster_links materialized one dict entry per (arg,out,inp) option-tuple per cluster copy (~120M entries, ~80s, huge memory on 3D), but the mapping is purely node-level (copy->root, identical 
option indices) and every consumer reduced it back to node level. Store cluster_links as {copy_node_idx: root_node_idx} and reconstruct option keys on demand (_cluster_root_key / _linked_option_keys / _root_to_copies). Serialization already used the node-level form on disk. Also @dataclass(slots=True) on DecisionVar (millions of instances). A/B verified byte-identical decision_vars + objective vs the prior commit (tiny + 1B/2D); all 50 cluster/serialization/approx/propagation tests pass. Authored with Claude. --- autoparallel/approximate_sharding.py | 27 +++---- autoparallel/optimize_sharding.py | 106 +++++++++++++++------------ autoparallel/serialization.py | 36 ++++----- 3 files changed, 86 insertions(+), 83 deletions(-) diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py index 146effcf..27b5ca07 100644 --- a/autoparallel/approximate_sharding.py +++ b/autoparallel/approximate_sharding.py @@ -275,7 +275,8 @@ def _solve(self, verbose: bool = False): # ------------------------------------------------------------------ # def _build_problem(self): opt = self.opt - cluster_linked = {key[0] for key in opt.cluster_links} + # cluster_links is node-level: copy node idx -> root node idx. 
+ cluster_linked = set(opt.cluster_links) self.cost_bearing = [ opt.node_map[node] for node in opt.strats @@ -283,8 +284,8 @@ def _build_problem(self): ] root_to_copies: dict[int, set] = defaultdict(set) - for linked_key, root_key in opt.cluster_links.items(): - root_to_copies[root_key[0]].add(linked_key[0]) + for copy_idx, root_idx in opt.cluster_links.items(): + root_to_copies[root_idx].add(copy_idx) self.node_mult = { v: 1 + len(root_to_copies.get(v, ())) for v in self.cost_bearing } @@ -456,15 +457,13 @@ def _topology_direct(self): ) opt = self.opt - cl = opt.cluster_links + cl = opt.cluster_links # node-level: copy node idx -> root node idx def rootkey(k): - return cl.get(k, k) + return opt._cluster_root_key(k) - cluster_linked = {key[0] for key in cl} - node_root = {} - for lk, rk in cl.items(): - node_root[lk[0]] = rk[0] + cluster_linked = set(cl) + node_root = dict(cl) def nroot(idx): return node_root.get(idx, idx) @@ -638,7 +637,7 @@ def _axis_restrict_from_log(self): the pin was applied as a PuLP row ("constraint") or as variable bounds ("fix", which leaves no row to parse) and in the lite (no-PuLP) build.""" opt = self.opt - node_root = {lk[0]: rk[0] for lk, rk in opt.cluster_links.items()} + node_root = dict(opt.cluster_links) # node-level: copy idx -> root idx restrict: dict[int, set] = {} for fname, kwargs in getattr(opt, "_constraint_log", []): if fname != "add_node_axis_constraint": @@ -698,10 +697,8 @@ def _build_groups(self, paired_edges, flow_couplings): opt = self.opt n = len(opt.nodes) uf = _UnionFind(n) - # cluster_links has one entry per option-key; collapse to unique - # (linked_node, root_node) pairs so the K-scaled loops below run over - # hundreds of pairs, not millions of duplicates. - cluster_pairs = {(lk[0], rk[0]) for lk, rk in opt.cluster_links.items()} + # cluster_links is node-level: (copy node idx, root node idx) pairs. 
+ cluster_pairs = set(opt.cluster_links.items()) for li, ri in cluster_pairs: uf.union(li, ri) for a, b, _ in paired_edges: @@ -1321,7 +1318,7 @@ def _write_back(self): selected.append(key) opt.selected_keys = list(selected) for rk in selected: - opt.selected_keys.extend(opt._root_to_linked.get(rk, [])) + opt.selected_keys.extend(opt._linked_option_keys(rk)) # Populate prob.objective (when a PuLP problem exists) so callers can also # score via pulp.value(prob.objective); the returned value uses the # equivalent but cheaper total_objective(). In the lite (no-PuLP) build, diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 7b648946..65a9121c 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -219,10 +219,13 @@ def concretize_gm(gm): return concrete_gm, orig_to_concrete, concrete_to_orig -@dataclass +@dataclass(slots=True) class DecisionVar: """A decision variable in the ILP, representing one (node, arg, output_placement, - input_placement) choice with its associated costs and strategy metadata.""" + input_placement) choice with its associated costs and strategy metadata. + + slots=True: there are millions of these on large/3D meshes, so dropping the + per-instance __dict__ materially cuts both build time and memory.""" var: Any # pulp.LpVariable cost: float @@ -397,7 +400,11 @@ def __init__( get_placement_options_timer().report() - self.cluster_links: dict[tuple, tuple] = {} + # Node-level: cluster-copy node idx -> root node idx (option indices are + # identical between copy and root; resolved on demand via + # _cluster_root_key / _linked_option_keys). 
+ self.cluster_links: dict[int, int] = {} + self._root_to_copies: dict[int, list[int]] = defaultdict(list) if self.solver_backend == "dp": t0 = time.perf_counter() self.solver = DPBasedShardingSolver(self) @@ -746,28 +753,40 @@ def build_sharding_metadata(self): return strats def create_cluster_links(self, clusters): - """Create a mapping between identical optimization nodes to reduce the - optimization space. If cluster_links[key1] == key2, the optimization - problem uses key2's variable in place of key1.""" + """Map each cluster-copy node to its root node (node-level). The optimizer + reuses the root's decision variable for every copy, and the per-(arg, out, + inp) option index is identical between a copy and its root, so we store + only the node->node map and reconstruct option keys on demand (see + _cluster_root_key / _linked_option_keys). Materializing one dict entry per + option-tuple instead costs tens of millions of entries (and seconds of + build time) on large/3D meshes.""" for cluster_group in clusters: cluster0 = cluster_group[0] for cluster_i in cluster_group[1:]: for n0, ni in zip(cluster0, cluster_i): - idx0 = self.node_map[n0] - idx1 = self.node_map[ni] - options_n0 = list(self.walk_over_options(n0)) - options_ni = list(self.walk_over_options(ni)) - assert options_n0 == options_ni, ( - f"Problem with graph clustering: {n0} and {ni} don't have the same number " - "of input/output placements. Please report a bug" + assert len(self.strats[n0].strategies) == len( + self.strats[ni].strategies + ), ( + f"Problem with graph clustering: {n0} and {ni} don't have " + "the same number of strategies. 
Please report a bug" ) - for argi, out_idx, inp_idx in options_n0: - self.cluster_links[(idx1, argi, out_idx, inp_idx)] = ( - idx0, - argi, - out_idx, - inp_idx, - ) + self.cluster_links[self.node_map[ni]] = self.node_map[n0] + + def _cluster_root_key(self, key): + """Resolve an option key to its cluster-root option key, or return it + unchanged when the node is not a cluster copy. The (arg, out, inp) indices + are identical between a copy and its root.""" + root_idx = self.cluster_links.get(key[0]) + return key if root_idx is None else (root_idx, key[1], key[2], key[3]) + + def _linked_option_keys(self, root_key): + """The option keys on the cluster copies of root_key's node (each a mirror + of root_key with the copy's node index). A copy mirrors its root's + per-option validity, so callers pass valid root keys only.""" + copies = self._root_to_copies.get(root_key[0]) + if not copies: + return () + return [(c, root_key[1], root_key[2], root_key[3]) for c in copies] def _all_input_nodes(self, node): """Variant of node.all_input_nodes that preserves duplicate nodes. @@ -820,7 +839,7 @@ def _create_pulp_variables(self, variable_category=pulp.LpBinary): f"Unsupported variable_category={variable_category!r}; " "expected pulp.LpBinary or pulp.LpContinuous" ) - cluster_linked_node_idxs = {key[0] for key in self.cluster_links} + cluster_linked_node_idxs = set(self.cluster_links) pulp_variables = {} for node, _ in self.strats.items(): @@ -854,7 +873,7 @@ def _get_pulp_variable(self, key): Returns None if the key was pruned (invalid/infinite-cost strategy). """ - root_key = self.cluster_links.get(key, key) + root_key = self._cluster_root_key(key) return self.pulp_variables.get(root_key) def _compute_edge_costs( @@ -909,7 +928,7 @@ def _build_decision_vars(self): """ # Precompute which node indices are cluster-linked so we can # copy costs from the root instead of recomputing them. 
- self._cluster_linked_node_idxs = {key[0] for key in self.cluster_links} + self._cluster_linked_node_idxs = set(self.cluster_links) t_compute = 0.0 t_edge = 0.0 @@ -1002,10 +1021,7 @@ def _build_decision_vars(self): # The root pass above updated redistribute_cost in place with # edge-computed costs; linked strats need the same values for # _compute_solution_cost and other readers. - linked_node_to_root_node: dict[int, int] = {} - for linked_key, root_key in self.cluster_links.items(): - linked_node_to_root_node[linked_key[0]] = root_key[0] - for linked_node_idx, root_node_idx in linked_node_to_root_node.items(): + for linked_node_idx, root_node_idx in self.cluster_links.items(): linked_node = self.nodes[linked_node_idx] root_node = self.nodes[root_node_idx] linked_op = self.strats[linked_node] @@ -1018,12 +1034,12 @@ def _build_decision_vars(self): ] n_cluster_copied = len(self.cluster_links) - # Linked keys mirror their root's validity (redistribute_cost is copied - # from the root above), so only valid root keys map to linked keys. - self._root_to_linked: dict[tuple, list[tuple]] = defaultdict(list) - for linked_key, root_key in self.cluster_links.items(): - if root_key in self._valid_keys: - self._root_to_linked[root_key].append(linked_key) + # Root node idx -> [copy node idxs]. Option keys are reconstructed on + # demand (see _linked_option_keys); a copy mirrors its root's per-option + # validity, so no per-option filtering is needed here. 
+ self._root_to_copies = defaultdict(list) + for copy_idx, root_idx in self.cluster_links.items(): + self._root_to_copies[root_idx].append(copy_idx) t_pulp_end = time.perf_counter() logger.debug( @@ -1060,7 +1076,7 @@ def _resolve_decision_var(self, key): dv = self.decision_vars.get(key) if dv is not None: return dv - root_key = self.cluster_links[key] + root_key = self._cluster_root_key(key) root_dv = self.decision_vars[root_key] node_idx, argi, out_idx, _ = key strategy = self.strats[self.nodes[node_idx]].strategies[out_idx] @@ -1088,8 +1104,10 @@ def _find_decision_var(self, node_idx, argi, out_idx): key = (node_idx, argi, out_idx, inp_idx) if key in self.decision_vars: return self._resolve_decision_var(key) - root_key = self.cluster_links.get(key) - if root_key is not None and root_key in self.decision_vars: + if ( + key[0] in self.cluster_links + and self._cluster_root_key(key) in self.decision_vars + ): return self._resolve_decision_var(key) return None @@ -1105,10 +1123,10 @@ def _collect_vars(self, node, node_idx, argi, group_by, resolve_clusters=False): result = {} for _, out_idx, inp_idx in self.walk_over_options(node, argi): key = (node_idx, argi, out_idx, inp_idx) - if key in self.cluster_links: + if key[0] in self.cluster_links: if not resolve_clusters: continue - var = self.pulp_variables.get(self.cluster_links[key]) + var = self.pulp_variables.get(self._cluster_root_key(key)) else: var = self.pulp_variables.get(key) if var is None: # pruned (invalid/infinite-cost) strategy edge @@ -1389,7 +1407,7 @@ def _set_objective(self): return terms = [] for key, dv in self.decision_vars.items(): - multiplier = 1 + len(self._root_to_linked.get(key, [])) + multiplier = 1 + len(self._root_to_copies.get(key[0], ())) terms.append(dv.var * dv.cost * multiplier) self.prob += pulp.lpSum(terms) @@ -1513,7 +1531,7 @@ def _solve(self, verbose=False): key for key, dv in self.decision_vars.items() if dv.var.value() == 1 ] for root_key in list(self.selected_keys): - 
self.selected_keys.extend(self._root_to_linked.get(root_key, [])) + self.selected_keys.extend(self._linked_option_keys(root_key)) if self.prob.status == -1: logger.warning(self.get_violated_constraints_log()) @@ -1638,7 +1656,7 @@ def solve_lp_relaxation(self, verbose=False, frac_tol=1e-6, extract=False): if dv.var.value() is not None and dv.var.value() > 0.5 ] for root_key in list(self.selected_keys): - self.selected_keys.extend(self._root_to_linked.get(root_key, [])) + self.selected_keys.extend(self._linked_option_keys(root_key)) solution = self._to_orig_solution(self._extract_and_validate_solution()) finally: for v, cat in zip(variables, original_cats): @@ -1932,10 +1950,8 @@ def get_json(self): # Build node-level cluster mapping: linked_node -> root_node cluster_roots: dict[torch.fx.Node, torch.fx.Node] = {} - for linked_key, root_key in self.cluster_links.items(): - linked_node = self.nodes[linked_key[0]] - root_node = self.nodes[root_key[0]] - cluster_roots[linked_node] = root_node + for copy_idx, root_idx in self.cluster_links.items(): + cluster_roots[self.nodes[copy_idx]] = self.nodes[root_idx] _normalize_cluster_layer(cluster_roots) diff --git a/autoparallel/serialization.py b/autoparallel/serialization.py index dfd953b1..4c9167d4 100644 --- a/autoparallel/serialization.py +++ b/autoparallel/serialization.py @@ -135,7 +135,7 @@ def save_optimizer(opt, path): # Re-key strats by node name, saving only root nodes (non-linked). # Linked nodes share identical strats with their root and are # reconstructed on load from cluster_links. 
- linked_node_names = {opt.nodes[lk[0]].name for lk in opt.cluster_links} + linked_node_names = {opt.nodes[c].name for c in opt.cluster_links} strats_by_name = { node.name: strat for node, strat in opt.strats.items() @@ -193,8 +193,8 @@ def save_optimizer(opt, path): "dv_costs_keys": dv_costs_keys, "dv_costs_vals": dv_costs_vals, "cluster_links_node_by_name": { - opt.nodes[lk[0]].name: opt.nodes[rk[0]].name - for lk, rk in opt.cluster_links.items() + opt.nodes[c].name: opt.nodes[r].name + for c, r in opt.cluster_links.items() }, "constraint_log": opt._constraint_log, "selected_keys_by_name": selected_keys_by_name, @@ -272,22 +272,12 @@ def load_optimizer(cls, path): opt.build_pulp = True opt.profile = {"timings": {}} - # Reconstruct cluster_links by expanding the node-level mapping over - # all (argi, out_idx, inp_idx) combinations. - opt.cluster_links = {} - for linked_name, root_name in cluster_links_node_by_name.items(): - linked_node = nodes_by_name[linked_name] - root_node = nodes_by_name[root_name] - linked_idx = opt.node_map[linked_node] - root_idx = opt.node_map[root_node] - for argi, out_idx, inp_idx in opt.walk_over_options(linked_node): - opt.cluster_links[(linked_idx, argi, out_idx, inp_idx)] = ( - root_idx, - argi, - out_idx, - inp_idx, - ) - opt._cluster_linked_node_idxs = {key[0] for key in opt.cluster_links} + # cluster_links is node-level: copy node idx -> root node idx. 
+ opt.cluster_links = { + opt.node_map[nodes_by_name[linked_name]]: opt.node_map[nodes_by_name[root_name]] + for linked_name, root_name in cluster_links_node_by_name.items() + } + opt._cluster_linked_node_idxs = set(opt.cluster_links) # Mesh placeholder — provides shape/dim_names for get_json() and ndim # for add_node_constraint() default placement, without needing a PG @@ -344,9 +334,9 @@ def load_optimizer(cls, path): len(opt.decision_vars), ) - opt._root_to_linked = defaultdict(list) - for linked_key, root_key in opt.cluster_links.items(): - opt._root_to_linked[root_key].append(linked_key) + opt._root_to_copies = defaultdict(list) + for copy_idx, root_idx in opt.cluster_links.items(): + opt._root_to_copies[root_idx].append(copy_idx) opt.prob = pulp.LpProblem("AutoParallel", pulp.LpMinimize) opt.add_default_constraints() @@ -399,7 +389,7 @@ def _restore_solution(opt, selected_keys_by_name, nodes_by_name): # Expand cluster links for root_key in list(opt.selected_keys): - opt.selected_keys.extend(opt._root_to_linked.get(root_key, [])) + opt.selected_keys.extend(opt._linked_option_keys(root_key)) def save_placements(opt, path): From f493ab85ad70ec364c8cfead4ec572b59506a7ec Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sun, 31 May 2026 11:30:54 -0700 Subject: [PATCH 17/27] Parallelize decision-var cost computation across forked workers decision_var_build (estimate_strategy_comms_cost over millions of edges) is the last build bottleneck and is per-node independent. Split _build_decision_vars into Phase A (compute per-edge costs, fork-parallel) + Phase B (assemble DecisionVars / PuLP vars, serial). Workers read the optimizer from the fork-inherited address space (no pickling of the mesh / strategy graph) and return only primitive cost tuples; the deterministic computation makes the result byte-identical to serial. Workers fork before any PuLP object exists. 
Cumulative build result on LLaMA3-1B 3D (2,4,8): 777s -> 62s (12.5x), now comparable to the ~50s approximate solve. A/B byte-identical (tiny + 1B/2D); 3D end-to-end objective unchanged (50222.7); all 50 build/approx/serialization/ propagation tests pass. Serial fallback via AP_PARALLEL_BUILD=1. Authored with Claude. --- autoparallel/optimize_sharding.py | 146 ++++++++++++++++++++++-------- 1 file changed, 107 insertions(+), 39 deletions(-) diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 65a9121c..1620be87 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -138,6 +138,61 @@ def _skip_enumeration_redistribute_cost(): _dt_utils.redistribute_cost = orig +# Number of fork workers for the per-edge cost computation in _build_decision_vars +# (the dominant cost of build on large/3D meshes). 1 = serial (use for A/B +# verification); default scales with cores. The computation is per-node +# independent and deterministic, so the parallel result is byte-identical. +_PARALLEL_BUILD_WORKERS = int( + os.environ.get("AP_PARALLEL_BUILD", str(min(32, (os.cpu_count() or 1)))) +) + +# Set to the optimizer before forking cost workers; the workers read it from the +# fork-inherited address space (no pickling of the mesh / strategy graph). +_FORK_OPT: "ShardingOptimizer | None" = None + + +def _par_node_edge_costs(node_idx): + """Worker: compute the per-edge (comm, transition) costs and the per-strategy + compute cost for one root node, reading the fork-inherited optimizer. Pure — + it reads strats and mutates nothing; the parent assembles DecisionVars from + these primitives. 
Returns (node_idx, out_data) where + out_data[out_idx] = (per_arg_compute, arg_rows) and + arg_rows[argi][inp_idx] = (comm_cost, transition_cost).""" + opt = _FORK_OPT + node = opt.nodes[node_idx] + op_strategy = opt.strats[node] + num_args = len(op_strategy.strategies[0].input_specs) + all_input_nodes = opt._all_input_nodes(node) + producer_strategies = [opt.strats[n] for n in all_input_nodes] + out_data = [] + for output_strategy in op_strategy.strategies: + per_arg_compute = ( + estimate_strategy_runtime_cost(node, output_strategy) / num_args + ) + arg_rows = [] + for argi, redist_costs in enumerate(output_strategy.redistribute_cost): + producer_strategy = ( + producer_strategies[argi] + if argi < len(producer_strategies) + else None + ) + arg_rows.append( + [ + opt._compute_edge_costs( + node, + output_strategy, + argi, + inp_idx, + default_comm_cost, + producer_strategy, + ) + for inp_idx, default_comm_cost in enumerate(redist_costs) + ] + ) + out_data.append((per_arg_compute, arg_rows)) + return node_idx, out_data + + def concretize_symint(val): """Concretize a SymInt to a plain int, pass through other values. @@ -944,47 +999,35 @@ def _build_decision_vars(self): (self.node_map[node], node, strat) for node, strat in self.strats.items() ] - # Build DVs for root nodes only (not cluster-linked). Compute the edge - # cost first and only materialize a variable when it is finite. - for node_idx, node, op_strategy in strats_items: - if node.op == "output": - continue - if node_idx in self._cluster_linked_node_idxs: - continue - - num_args = len(op_strategy.strategies[0].input_specs) - # Hoisted out of the per-(out_idx, argi, inp_idx) loops: these depend - # only on the node, not on the strategy choice. Recomputing them per - # decision var was O(#vars) calls to _all_input_nodes (a tree_flatten - # each), which dominated build time on large/3D meshes. 
- all_input_nodes = self._all_input_nodes(node) - producer_strategies = [self.strats[n] for n in all_input_nodes] - - for out_idx, output_strategy in enumerate(op_strategy.strategies): - tc0 = time.perf_counter() - compute_cost = estimate_strategy_runtime_cost(node, output_strategy) - t_compute += time.perf_counter() - tc0 - per_arg_compute = compute_cost / num_args - - te0 = time.perf_counter() + # Phase A: compute every root node's per-edge costs. This (the comm-cost + # estimate over millions of edges) dominates build, is per-node + # independent, and mutates nothing, so it runs across forked workers. + root_idxs = [ + node_idx + for node_idx, node, _ in strats_items + if node.op != "output" and node_idx not in self._cluster_linked_node_idxs + ] + tc0 = time.perf_counter() + node_results = self._compute_node_edge_costs(root_idxs) + t_edge = time.perf_counter() - tc0 + + # Phase B: assemble decision vars (and PuLP variables) from the computed + # costs. Serial because PuLP vars and DecisionVars hold parent-owned + # strategy objects; byte-identical to computing the costs inline. This + # also writes the real costs back into each strat's redistribute_cost + # (overwriting the enumeration dummies) for the cluster batch-copy and + # _compute_solution_cost readers below. 
+ for node_idx, out_data in node_results: + node = self.nodes[node_idx] + op_strategy = self.strats[node] + for out_idx, (per_arg_compute, arg_rows) in enumerate(out_data): + output_strategy = op_strategy.strategies[out_idx] for argi, redist_costs in enumerate(output_strategy.redistribute_cost): - producer_strategy = ( - producer_strategies[argi] - if argi < len(producer_strategies) - else None - ) input_spec = output_strategy.input_specs[argi] - for inp_idx, default_comm_cost in enumerate(redist_costs): - comm_cost, transition_cost = self._compute_edge_costs( - node, - output_strategy, - argi, - inp_idx, - default_comm_cost, - producer_strategy, - ) + for inp_idx, (comm_cost, transition_cost) in enumerate( + arg_rows[argi] + ): redist_costs[inp_idx] = comm_cost - cost = comm_cost + per_arg_compute + transition_cost # Prune invalid (infinite-cost) edges: no variable, no # DecisionVar. A key absent from decision_vars is treated @@ -1015,7 +1058,6 @@ def _build_decision_vars(self): input_spec=input_spec, ) n_vars += 1 - t_edge += time.perf_counter() - te0 # Batch-copy redistribute_cost from root strats to linked strats. # The root pass above updated redistribute_cost in place with @@ -1071,6 +1113,32 @@ def _build_decision_vars(self): ) return decision_vars + def _compute_node_edge_costs(self, root_idxs): + """Phase A of _build_decision_vars: per-root-node edge costs. Parallel + across forked workers when enabled; workers read this optimizer from the + fork-inherited address space (no pickling of the mesh / strategy graph) + and return only primitive cost tuples. 
The computation is deterministic, + so the parallel result is byte-identical to the serial path.""" + global _FORK_OPT + _FORK_OPT = self + try: + if _PARALLEL_BUILD_WORKERS <= 1 or len(root_idxs) < 64: + return [_par_node_edge_costs(ni) for ni in root_idxs] + import multiprocessing as mp + + ctx = mp.get_context("fork") + with ctx.Pool(_PARALLEL_BUILD_WORKERS) as pool: + # imap (ordered), not imap_unordered: results come back in + # root_idxs order so decision_vars is assembled in the same node + # order as the serial path. This keeps the PuLP objective's + # lpSum term order identical too, so even the ILP path is + # bit-for-bit unchanged (float addition is not associative). + return list( + pool.imap(_par_node_edge_costs, root_idxs, chunksize=4) + ) + finally: + _FORK_OPT = None + def _resolve_decision_var(self, key): """Return a DecisionVar for key, reconstructing on the fly for linked keys.""" dv = self.decision_vars.get(key) From 496e7b33dc5f1c74958b4cdc2f676527c579c304 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sun, 31 May 2026 15:53:42 -0700 Subject: [PATCH 18/27] Add cross-size prune+dp benchmark (latency + LP-relaxation accuracy) examples/_bench_sizes.py runs the prune+dp approximate search across LLaMA3 1B/3B/8B/70B on a configurable mesh, reporting end-to-end latency (lite build + approx solve) and an accuracy reference: the gap of the approximate objective against a HiGHS LP-relaxation lower bound (the sharding LP is integral, so the bound equals the exact ILP optimum). Controlled via MODEL/MESH/SEQLEN/ACCURACY/ LP_METHOD env vars. Authored with Claude. 
--- examples/_bench_sizes.py | 166 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 examples/_bench_sizes.py diff --git a/examples/_bench_sizes.py b/examples/_bench_sizes.py new file mode 100644 index 00000000..46962209 --- /dev/null +++ b/examples/_bench_sizes.py @@ -0,0 +1,166 @@ +"""e2e prune+dp (approx) search across LLaMA3 sizes: latency + accuracy. + +For one MODEL on one MESH: + * latency: lite build (build_pulp=False) + ApproximateShardingSolver -> the + production prune+dp path (build_s, approx_s, total, objective). + * accuracy: a separate full PuLP build -> HiGHS LP-relaxation lower bound + (this sharding LP is integral, so the bound equals the exact ILP optimum); + gap = (approx_obj - lb) / lb. + +Env: MODEL (1b|3b|8b|70b), MESH (e.g. 2,4,8), SEQLEN. One model per process. +""" +import gc +import logging +import os +import time +from unittest.mock import patch + +import numpy as np +import pulp +import scipy.sparse as sp +import torch +from scipy.optimize import linprog +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), + ("get_device_capability", lambda *a, **k: (9, 0))]: + patch(f"torch.cuda.{fn}", val).start() +patch("torch.cuda.get_device_properties", lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, 
"multi_processor_count": 132})()).start() + +MODEL = os.environ.get("MODEL", "1b") +SEQLEN = int(os.environ.get("SEQLEN", "2048")) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) +ws = 1 +for d in MESH_SHAPE: + ws *= d +names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] +torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) +ndim = mesh.ndim +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] + +_CFG = { + "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), + "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), + "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), + "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), +} + + +def model_fn(): + args = TransformerModelArgs( + rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL] + ) + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") + + +def constrain(autop): + x = (Shard(0),) + (Replicate(),) * (ndim - 1) + out = (Shard(0), Shard(2)) if ndim == 2 else x + autop.add_parameter_memory_constraint(low=None, high=None) + autop.add_input_constraints([x]) + autop.add_output_constraints([out]) + + +def lp_lower_bound_highs(opt): + """LP relaxation (binaries -> [0,1]) of the built problem, solved with HiGHS. 
+ Objective is read from decision_vars and constraints from prob.constraints + using id()-keyed indexing (avoids hashing the long PuLP var names).""" + opt._set_objective() + opt._apply_memory_constraint() + variables = opt.prob.variables() + vidx = {id(v): i for i, v in enumerate(variables)} + n = len(variables) + c = np.zeros(n) + for key, dv in opt.decision_vars.items(): + mult = 1 + len(opt._root_to_copies.get(key[0], ())) + c[vidx[id(dv.var)]] += dv.cost * mult + re = ru = 0 + reqr, reqc, reqd, beq = [], [], [], [] + rubr, rubc, rubd, bub = [], [], [], [] + for con in opt.prob.constraints.values(): + rhs = -con.constant + if con.sense == pulp.LpConstraintEQ: + for v, co in con.items(): + reqr.append(re); reqc.append(vidx[id(v)]); reqd.append(co) + beq.append(rhs); re += 1 + else: + s = 1.0 if con.sense == pulp.LpConstraintLE else -1.0 + for v, co in con.items(): + rubr.append(ru); rubc.append(vidx[id(v)]); rubd.append(s * co) + bub.append(s * rhs); ru += 1 + A_eq = sp.csr_matrix((reqd, (reqr, reqc)), shape=(re, n)) if re else None + A_ub = sp.csr_matrix((rubd, (rubr, rubc)), shape=(ru, n)) if ru else None + # Dual simplex: far faster than the barrier (IPM) on this near-integral, + # network-flow-like LP. We only need the optimal objective as the bound. 
+ method = os.environ.get("LP_METHOD", "highs-ds") + res = linprog(c, A_ub=A_ub, b_ub=(bub or None), A_eq=A_eq, b_eq=(beq or None), + bounds=(0, 1), method=method, options={"disp": True}) + if not res.success: + raise RuntimeError(f"HiGHS failed: {res.message}") + return res.fun, n, re + ru + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) +print(f"### MODEL={MODEL} mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ###", flush=True) + +# ---- latency: lite build + prune+dp approx (production path) ---- +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") +autop.__enter__() +constrain(autop) +build_lite = time.perf_counter() - t +opt = autop.sharding_optimizer +n_dv = len(opt.decision_vars) +params = opt.profile["model"]["parameter_numel"] +t = time.perf_counter() +ApproximateShardingSolver(opt).get_solution(verbose=False) +approx_s = time.perf_counter() - t +obj = opt.profile["approximate"]["objective"] +print(f"[latency] params={params/1e9:.2f}B lite_build={build_lite:.1f}s " + f"approx={approx_s:.1f}s total={build_lite + approx_s:.1f}s " + f"decision_vars={n_dv} obj={obj:.1f}", flush=True) +autop.__exit__(None, None, None) +del autop, opt +gc.collect() + +if os.environ.get("ACCURACY", "1") != "1": + print(f"[RESULT] MODEL={MODEL} params={params/1e9:.2f}B " + f"prune+dp: build={build_lite:.1f}s approx={approx_s:.1f}s " + f"total={build_lite+approx_s:.1f}s obj={obj:.1f} (LP skipped)", flush=True) + raise SystemExit(0) + +# ---- accuracy: full build + HiGHS LP lower bound ---- +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp") +autop.__enter__() +constrain(autop) +full_build = time.perf_counter() - t +opt = autop.sharding_optimizer +t = time.perf_counter() +lb, nvar, ncon = lp_lower_bound_highs(opt) +lp_s = time.perf_counter() - t +gap = 100 * (obj - lb) 
/ lb +print(f"[accuracy] full_build={full_build:.1f}s lp_solve={lp_s:.1f}s " + f"lower_bound={lb:.1f} vars={nvar} cons={ncon}", flush=True) +print(f"[RESULT] MODEL={MODEL} params={params/1e9:.2f}B " + f"prune+dp: build={build_lite:.1f}s approx={approx_s:.1f}s total={build_lite+approx_s:.1f}s " + f"obj={obj:.1f} LP_lb={lb:.1f} gap={gap:+.2f}%", flush=True) From 2f06359c3d0e51c3092adc996122a829bfb32c2e Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sun, 31 May 2026 21:18:37 -0700 Subject: [PATCH 19/27] Approx solver: replace loopy min-sum BP with TRW-S; skip-clustered build work The approximate solver's min-sum belief propagation settled into globally inconsistent fixed points on the sharding MRF (the undirected factor graph is loopy: residual and multi-branch reconvergence give ~129 cycles after clustering), leaving the objective 5-16% above the optimum on 2D and up to 12% on 3D. The factor graph and objective are faithful and the optimum is representable (verified against an exact CBC solve on 2D and an integral LP on 3D), so this was purely a solver failure. _belief_propagation now runs tree-reweighted sequential message passing (TRW-S): a node ordering induces monotonic chains, each node is reweighted by 1/max(in,out)-degree, and forward/backward half-sweeps send min-sum messages only along the pass direction. On this integral problem TRW-S converges to the exact MAP: the bare approx (no annotation) drops to +0.00% on 2D (1B/3B/8B/70B, matching CBC) and +0.08-0.82% on 3D, ~20-30x faster than solving the LP. The decoded energy converges in long irregular plateaus, so a fixed sweep budget (time-bounded) is used rather than an early-stop heuristic; the now-dominated greedy second candidate is dropped. Two algorithm-preserving build speedups also land here: validate() skips cluster-copy nodes (the root covers them), and graph clustering memoizes each node's op-strategy string instead of rebuilding it per consumer. Authored with Claude. 
--- autoparallel/approximate_sharding.py | 127 +++++++++--------- autoparallel/graph_passes/graph_clustering.py | 18 +-- autoparallel/optimize_sharding.py | 5 + 3 files changed, 79 insertions(+), 71 deletions(-) diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py index 27b5ca07..5361a8a0 100644 --- a/autoparallel/approximate_sharding.py +++ b/autoparallel/approximate_sharding.py @@ -121,8 +121,7 @@ def __init__( self, optimizer, candidate_limit: Optional[int] = 64, - bp_iters: int = 20, - bp_damping: float = 0.2, + bp_iters: int = 400, bp_tol: float = 1e-3, max_sweeps: int = 12, max_time_s: float = 60.0, @@ -133,7 +132,6 @@ def __init__( self.opt = optimizer self.candidate_limit = candidate_limit self.bp_iters = bp_iters - self.bp_damping = bp_damping self.bp_tol = bp_tol self.max_sweeps = max_sweeps self.max_time_s = max_time_s @@ -191,12 +189,15 @@ def _solve(self, verbose: bool = False): ) deadline = t0 + self.max_time_s - # Candidate 1: belief propagation init. + # TRW-S init, then local-search polish. TRW-S reaches the exact MAP on the + # (integral) sharding problem, so the old greedy second candidate it used + # to be compared against is strictly dominated and has been dropped; the + # polish remains for the memory budget and as a local-search safety net. 
t_bp0 = time.perf_counter() - self._belief_propagation() + self._belief_propagation(deadline) if verbose: - logger.info("approx phase: bp converged iter=%s delta=%.4g in %.2fs; " - "bp_decode energy=%.1f", + logger.info("approx phase: trws iter=%s delta=%.4g in %.2fs; " + "decode energy=%.1f", getattr(self, "_bp_last_iter", None), getattr(self, "_bp_last_delta", float("nan")), time.perf_counter() - t_bp0, @@ -204,25 +205,11 @@ def _solve(self, verbose: bool = False): self._memory_repair() self._coordinate_descent(deadline) if verbose: - logger.info("approx phase: bp+cd energy=%.1f", self._fast_total_energy()) + logger.info("approx phase: trws+cd energy=%.1f", self._fast_total_energy()) self._star_block_search(deadline) bp_energy = self._fast_total_energy() - bp_snapshot = [g.current for g in self.groups] if verbose: - logger.info("approx phase: bp+cd+star energy=%.1f", bp_energy) - - # Candidate 2: greedy init (cheap insurance against BP doing poorly). - self._greedy_init() - self._memory_repair() - self._coordinate_descent(deadline) - self._star_block_search(deadline) - greedy_energy = self._fast_total_energy() - if verbose: - logger.info("approx phase: greedy+cd+star energy=%.1f", greedy_energy) - - if bp_energy <= greedy_energy: - for gid, ci in enumerate(bp_snapshot): - self._set_group(gid, ci) + logger.info("approx phase: trws+cd+star energy=%.1f", bp_energy) t_solve = time.perf_counter() - t0 - t_build objective = self._write_back() @@ -240,12 +227,11 @@ def _solve(self, verbose: bool = False): ) logger.info( "ApproximateShardingSolver: status=%s objective=%.4f " - "(bp=%.1f greedy=%.1f) groups=%d nodes=%d " + "(trws+polish=%.1f) groups=%d nodes=%d " "timings={build=%.3fs,solve=%.3fs,total=%.3fs}", status, objective, bp_energy, - greedy_energy, len(self.groups), len(self.cost_bearing), t_build, @@ -260,7 +246,6 @@ def _solve(self, verbose: bool = False): "total_s": total_s, "groups": len(self.groups), "bp_energy": bp_energy, - "greedy_energy": greedy_energy, 
} if infeasible: raise RuntimeError( @@ -1028,46 +1013,80 @@ def _fast_total_energy(self): # ------------------------------------------------------------------ # # Belief propagation (min-sum) + decode # ------------------------------------------------------------------ # - def _belief_propagation(self): - """Sequential (forward-backward, topological) min-sum message passing. - Exact MAP on trees in one sweep; near-optimal on the near-tree transformer - graph in a few sweeps, far better than synchronous flooding.""" + def _belief_propagation(self, deadline=None): + """Sequential tree-reweighted message passing (TRW-S). + + Plain loopy min-sum BP settles into globally-inconsistent fixed points on + this MRF (empirically 5-16% above the optimum). TRW-S optimizes a convex + upper bound over a tree decomposition (here: monotonic chains induced by a + node ordering), so on the integral sharding problem it converges to the + exact MAP. Node g is reweighted by 1/(chains through g) = 1/max(in,out)deg + under the ordering; forward and backward half-sweeps send only along edges + oriented with the pass. We decode each sweep and keep the best assignment.""" G = len(self.groups) + if G == 0: + return unary = self.g_unary nbrs = self.nbrs - damp = self.bp_damping order = sorted(range(G), key=lambda g: min(self.groups[g].members)) + pos = [0] * G + for i, g in enumerate(order): + pos[g] = i + gamma = np.ones(G) + for g in range(G): + indeg = sum(1 for h in nbrs[g] if pos[h] < pos[g]) + outdeg = sum(1 for h in nbrs[g] if pos[h] > pos[g]) + gamma[g] = 1.0 / max(indeg, outdeg, 1) + msg: dict[tuple, np.ndarray] = {} for g in range(G): for h in nbrs[g]: msg[(g, h)] = np.zeros(len(unary[h])) + # We decode every sweep and keep the best assignment. 
The decoded energy + # converges in long, irregular plateaus (it can sit at a high value for + # ~100 sweeps, drop, plateau again, then drop to the optimum), so neither + # an energy-plateau counter nor a message-delta threshold detects true + # convergence without stopping on a false plateau. We therefore run a + # fixed sweep budget (bounded by the time deadline), which is enough for + # the slowest converger observed, and an exact fixed point ends early. + best_e = INF + best_snap = None for sweep in range(self.bp_iters): max_delta = 0.0 - for direction in (order, order[::-1]): - for g in direction: + for forward in (True, False): + for g in order if forward else order[::-1]: if not nbrs[g]: continue - in_sum = unary[g].copy() - for k in nbrs[g]: - in_sum += msg[(k, g)] + wp = unary[g].copy() + for r in nbrs[g]: + wp += msg[(r, g)] + wp *= gamma[g] for h in nbrs[g]: - excl = in_sum - msg[(h, g)] + if (pos[h] > pos[g]) != forward: + continue P = self._pair_matrix(g, h) # (D_g, D_h) - m = (excl[:, None] + P).min(axis=0) + m = ((wp - msg[(h, g)])[:, None] + P).min(axis=0) m -= m.min() - md = (1 - damp) * m + damp * msg[(g, h)] - delta = np.abs(md - msg[(g, h)]).max() - if delta > max_delta: - max_delta = delta - msg[(g, h)] = md + d = np.abs(m - msg[(g, h)]).max() + if d > max_delta: + max_delta = d + msg[(g, h)] = m + self._decode(msg) + e = self._fast_total_energy() + if e < best_e: + best_e, best_snap = e, [grp.current for grp in self.groups] self._bp_last_iter = sweep + 1 self._bp_last_delta = max_delta - if max_delta < self.bp_tol: + if max_delta == 0.0 or ( + deadline is not None and time.perf_counter() > deadline + ): break - self._decode(msg) + if best_snap is not None: + for gid, ci in enumerate(best_snap): + self._set_group(gid, ci) def _decode(self, msg): """Sequential topological decode: fix each group to the argmin of its @@ -1097,24 +1116,6 @@ def _set_group(self, gid, ci): for m, o in group.choices[ci].items(): self.cur_out[m] = o - def 
_greedy_init(self): - order = sorted(range(len(self.groups)), - key=lambda g: min(self.groups[g].members)) - for gid in order: - self._set_group(gid, 0) - for gid in order: - best_i, best_e = 0, INF - for ci in range(self.groups[gid].domain): - e = self.g_unary[gid][ci] - for h in self.nbrs[gid]: - if min(self.groups[h].members) < min(self.groups[gid].members): - ch = self.groups[h].current - e += (self.C[(gid, h)][ci, ch] if gid < h - else self.C[(h, gid)][ch, ci]) - if e < best_e: - best_i, best_e = ci, e - self._set_group(gid, best_i) - def _coordinate_descent(self, deadline): for _ in range(self.max_sweeps): if time.perf_counter() > deadline: diff --git a/autoparallel/graph_passes/graph_clustering.py b/autoparallel/graph_passes/graph_clustering.py index c01a09a3..a8efafea 100644 --- a/autoparallel/graph_passes/graph_clustering.py +++ b/autoparallel/graph_passes/graph_clustering.py @@ -65,18 +65,17 @@ def _prepare_op_strategy(op_strategy): return str(op_strategy) -def _hash_node(node, strategies, input_pickler): +def _hash_node(node, strategies, input_pickler, op_str): + # op_str caches _prepare_op_strategy(strategies[n]) per node: each node's + # (large, 3D-mesh) strategy string is otherwise rebuilt once as self plus + # once per consumer, dominating clustering time on deep models. 
key = ( str(node.target), node.meta.get("partitioner_tag"), node.meta.get("stack_trace"), _normalize_args(node), - _prepare_op_strategy(strategies[node]), - tuple( - _prepare_op_strategy(strategies[s]) - for s in node.all_input_nodes - if s in strategies - ), + op_str[node], + tuple(op_str[s] for s in node.all_input_nodes if s in strategies), ) return sha256_hash(input_pickler.dumps(key)) @@ -107,6 +106,7 @@ def get_identical_regions( hash_to_duplicates: dict[str, IdenticalNodes] = defaultdict(list) node_to_duplicates: dict[Node, IdenticalNodes] = {} t = time.time() + op_str = {n: _prepare_op_strategy(s) for n, s in strategies.items()} for node in graph.nodes: if node.op == "placeholder": continue @@ -115,7 +115,9 @@ def get_identical_regions( # HOP submodule get_attr nodes are not in strategies. continue - duplicates = hash_to_duplicates[_hash_node(node, strategies, input_pickler)] + duplicates = hash_to_duplicates[ + _hash_node(node, strategies, input_pickler, op_str) + ] duplicates.append(node) node_to_duplicates[node] = duplicates logger.debug(f"Hashed nodes in {time.time() - t} s") diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 1620be87..9e73889e 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -1209,6 +1209,11 @@ def validate(self): continue if node not in self.strats: continue + # Cluster copies are structurally identical to their root (same + # strategies and input structure, asserted in create_cluster_links), + # so validating the root covers them. 
+ if self.node_map[node] in self.cluster_links: + continue strat = self.strats[node] strat0 = strat.strategies[0] all_input_nodes = self._all_input_nodes(node) From 2ce86ea934cfbfd30ac0f51d28f89c91b6da6c09 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sun, 31 May 2026 21:18:43 -0700 Subject: [PATCH 20/27] Add approx-solver diagnostic + accuracy benchmarks Helper scripts used to diagnose and validate the TRW-S fix: factor-graph faithfulness/representability check, LP integrality check, hyperparameter and iterated-local-search sweeps, a standalone TRW-S prototype, an annotation ablation, and a per-phase build profiler. Authored with Claude. --- examples/_bench_anno.py | 116 ++++++++++++++++++++ examples/_bench_approx_diag.py | 173 ++++++++++++++++++++++++++++++ examples/_bench_approx_ils.py | 136 +++++++++++++++++++++++ examples/_bench_approx_sweep.py | 106 ++++++++++++++++++ examples/_bench_build_profile.py | 19 +++- examples/_bench_lp_integrality.py | 118 ++++++++++++++++++++ examples/_bench_trws.py | 173 ++++++++++++++++++++++++++++++ 7 files changed, 838 insertions(+), 3 deletions(-) create mode 100644 examples/_bench_anno.py create mode 100644 examples/_bench_approx_diag.py create mode 100644 examples/_bench_approx_ils.py create mode 100644 examples/_bench_approx_sweep.py create mode 100644 examples/_bench_lp_integrality.py create mode 100644 examples/_bench_trws.py diff --git a/examples/_bench_anno.py b/examples/_bench_anno.py new file mode 100644 index 00000000..45e546ff --- /dev/null +++ b/examples/_bench_anno.py @@ -0,0 +1,116 @@ +"""prune+dp+annotation (the full joint config) vs prune+dp alone, compared to a +known optimum/LP lower bound. Lite build + optional TP-plan annotation + approx. 
+Env: MODEL, MESH, SEQLEN, LP_LB.""" +import logging +import os +import time +from unittest.mock import patch + +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), + ("get_device_capability", lambda *a, **k: (9, 0))]: + patch(f"torch.cuda.{fn}", val).start() +patch("torch.cuda.get_device_properties", lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() + +MODEL = os.environ.get("MODEL", "70b") +SEQLEN = int(os.environ.get("SEQLEN", "2048")) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) +LP_LB = float(os.environ.get("LP_LB", "0")) +ws = 1 +for d in MESH_SHAPE: + ws *= d +names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] +torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) +ndim = mesh.ndim +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] +_CFG = { + "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), + "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), + "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, 
multiple_of=1024), + "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), +} + + +def model_fn(): + args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size, + max_seq_len=SEQLEN, **_CFG[MODEL]) + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") + + +COLUMN_PARALLEL = (None,) * (ndim - 1) + (Shard(0),) +ROW_PARALLEL = (None,) * (ndim - 1) + (Shard(1),) + + +def annotate_tp_plan(autop): + for proj in ["wq", "wk", "wv"]: + autop.annotate_parameter(f"layers.*.attention.{proj}.weight", COLUMN_PARALLEL) + autop.annotate_parameter("layers.*.attention.wo.weight", ROW_PARALLEL) + for proj in ["w1", "w3"]: + autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", COLUMN_PARALLEL) + autop.annotate_parameter("layers.*.feed_forward.w2.weight", ROW_PARALLEL) + + +def constrain(autop): + x = (Shard(0),) + (Replicate(),) * (ndim - 1) + out = (Shard(0), Shard(2)) if ndim == 2 else x + autop.add_parameter_memory_constraint(low=None, high=None) + autop.add_input_constraints([x]) + autop.add_output_constraints([out]) + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) +print(f"### anno MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ###", flush=True) + + +def gap(o): + return 100 * (o - LP_LB) / LP_LB if LP_LB else float("nan") + + +# prune+dp (no annotation) +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") +autop.__enter__() +constrain(autop) +build_s = time.perf_counter() - t +opt = autop.sharding_optimizer +t = time.perf_counter() +ApproximateShardingSolver(opt).get_solution(verbose=False) +dp_s = time.perf_counter() - t +obj_dp = opt.profile["approximate"]["objective"] +print(f"[dp] build={build_s:.1f}s approx={dp_s:.1f}s obj={obj_dp:.1f} 
gap={gap(obj_dp):+.2f}%", flush=True) + +# + annotation +t = time.perf_counter() +annotate_tp_plan(autop) +prop = autop.propagate_annotations(verbose=False, method="fix") +prop_s = time.perf_counter() - t +t = time.perf_counter() +ApproximateShardingSolver(opt).get_solution(verbose=False) +ann_s = time.perf_counter() - t +obj_ann = opt.profile["approximate"]["objective"] +print(f"[dp+anno] build={build_s:.1f}s propagate={prop_s:.1f}s approx={ann_s:.1f}s " + f"total={build_s+prop_s+ann_s:.1f}s obj={obj_ann:.1f} gap={gap(obj_ann):+.2f}% " + f"(pinned {prop.nodes_determined} nodes)", flush=True) +print(f"[RESULT] MODEL={MODEL} mesh={MESH_SHAPE} dp_gap={gap(obj_dp):+.2f}% " + f"dp+anno_gap={gap(obj_ann):+.2f}% dp+anno_total={build_s+prop_s+ann_s:.1f}s", flush=True) diff --git a/examples/_bench_approx_diag.py b/examples/_bench_approx_diag.py new file mode 100644 index 00000000..25de4d85 --- /dev/null +++ b/examples/_bench_approx_diag.py @@ -0,0 +1,173 @@ +"""Diagnose the bare approx gap: is the factor graph FAITHFUL (scores the true +optimum correctly -> solver is at fault) or UNFAITHFUL (drops cost -> model is at +fault), and is the optimum REPRESENTABLE in the group choices (pruning)? + +Builds the ILP, solves it exactly with CBC, then checks whether the approx's own +machinery (total_objective + factor graph) reproduces the CBC optimum, and where +the approx's own solution differs. 
Env: MODEL, MESH, SEQLEN.""" +import logging +import os +import time +from collections import defaultdict +from unittest.mock import patch + +import pulp +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), + ("get_device_capability", lambda *a, **k: (9, 0))]: + patch(f"torch.cuda.{fn}", val).start() +patch("torch.cuda.get_device_properties", lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() + +MODEL = os.environ.get("MODEL", "1b") +SEQLEN = int(os.environ.get("SEQLEN", "2048")) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(",")) +ws = 1 +for d in MESH_SHAPE: + ws *= d +names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] +torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) +ndim = mesh.ndim +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] +_CFG = { + "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), + "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), + "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, 
multiple_of=1024), + "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), +} + + +def model_fn(): + args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size, + max_seq_len=SEQLEN, **_CFG[MODEL]) + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") + + +def constrain(autop): + x = (Shard(0),) + (Replicate(),) * (ndim - 1) + out = (Shard(0), Shard(2)) if ndim == 2 else x + autop.add_parameter_memory_constraint(low=None, high=None) + autop.add_input_constraints([x]) + autop.add_output_constraints([out]) + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) +print(f"### diag MODEL={MODEL} mesh={MESH_SHAPE}{names} ###", flush=True) + +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp") +autop.__enter__() +constrain(autop) +opt = autop.sharding_optimizer +print(f"[build] {time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True) + +opt._set_objective() +opt._apply_memory_constraint() +t = time.perf_counter() +opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, options=["preprocess off"])) +obj_cbc = pulp.value(opt.prob.objective) +print(f"[cbc] solve={time.perf_counter()-t:.1f}s obj={obj_cbc:.1f} status={pulp.LpStatus[opt.prob.status]}", flush=True) + +# CBC per-(root)node chosen out_idx +cbc_out = {} +for key, var in opt.pulp_variables.items(): + v = var.varValue + if v is not None and v > 0.5: + cbc_out[key[0]] = key[2] + +approx = ApproximateShardingSolver(opt) +approx._build_problem() +approx._build_factors() + +# (A) FAITHFULNESS: exact objective of the CBC solution via the approx machinery. 
+approx.cur_out = dict(cbc_out) +e_cbc_total = approx.total_objective() +print(f"[faithful] approx.total_objective(CBC soln) = {e_cbc_total:.1f} " + f"(CBC obj {obj_cbc:.1f}; match={abs(e_cbc_total-obj_cbc)<1.0})", flush=True) + +# (B) REPRESENTABILITY: can the group choices express the CBC solution? +cbc_full = dict(cbc_out) +for copy_idx, root_idx in opt.cluster_links.items(): + if root_idx in cbc_out: + cbc_full[copy_idx] = cbc_out[root_idx] +unrep = [] +cbc_group_choice = {} +for gid, g in enumerate(approx.groups): + found = None + for ci, choice in enumerate(g.choices): + if all(cbc_full.get(m) == o for m, o in choice.items()): + found = ci + break + if found is None: + unrep.append(gid) + else: + cbc_group_choice[gid] = found +print(f"[representable] groups={len(approx.groups)} " + f"with_no_matching_choice={len(unrep)}", flush=True) + +# (C) factor-graph energy of the CBC solution (if representable) +if not unrep: + for gid, ci in cbc_group_choice.items(): + approx._set_group(gid, ci) + fge = approx._fast_total_energy() + print(f"[fg-energy] _fast_total_energy(CBC soln) = {fge:.1f} " + f"(match CBC {abs(fge-obj_cbc)<1.0})", flush=True) + +# (D) run the normal approx, localize where it differs from CBC +approx2 = ApproximateShardingSolver(opt) +approx2.get_solution(verbose=False) +obj_approx = opt.profile["approximate"]["objective"] +ax_out = dict(approx2.cur_out) +print(f"[approx] obj={obj_approx:.1f} gap={100*(obj_approx-obj_cbc)/obj_cbc:+.2f}%", flush=True) + +# per-node exact cost under each assignment (cost_bearing nodes), to localize gap +def node_cost(solver, out_map, v): + o = out_map[v] + node = opt.nodes[v] + strat = opt.strats[node].strategies[o] + prod = solver._arg_prod.get(v, {}) + c = 0.0 + for argi in range(len(strat.redistribute_cost)): + p = prod.get(argi) + inp = out_map[p] if (p is not None and p in out_map) else 0 + key = (v, argi, o, inp) + dv = opt.decision_vars.get(key) + if dv is None: + return None + c += dv.cost + return 
solver.node_mult[v] * c + +diffs = [] +for v in approx2.cost_bearing: + if cbc_out.get(v) != ax_out.get(v): + c_cbc = node_cost(approx2, cbc_out, v) + c_ax = node_cost(approx2, ax_out, v) + if c_cbc is not None and c_ax is not None: + diffs.append((c_ax - c_cbc, v, opt.nodes[v].name, cbc_out.get(v), ax_out.get(v))) +diffs.sort(reverse=True) +print(f"[localize] {len(diffs)} cost-bearing nodes differ; top contributors (approx-cbc):", flush=True) +for d, v, name, oc, oa in diffs[:15]: + print(f" +{d:10.1f} node={name[:40]:40s} cbc_out={oc} approx_out={oa}", flush=True) +tot = sum(d for d, *_ in diffs) +print(f"[localize] total node-cost diff over differing nodes = {tot:.1f} " + f"(gap = {obj_approx-obj_cbc:.1f})", flush=True) diff --git a/examples/_bench_approx_ils.py b/examples/_bench_approx_ils.py new file mode 100644 index 00000000..d6e1b437 --- /dev/null +++ b/examples/_bench_approx_ils.py @@ -0,0 +1,136 @@ +"""Diagnose whether the approx solver's objective is stuck in a local-optimum +basin that a stronger search escapes. Build once, run the stock BP+localsearch, +then run iterated local search (perturb a random set of groups, re-optimize, +keep best) for a time budget. If ILS beats the stock objective meaningfully, the +gap is a move-set/init weakness (and the LP bound is ~reachable); if not, 607260 +is robust. 
Env: MODEL, MESH, SEQLEN, LP_LB, ILS_S.""" +import logging +import os +import random +import time +from unittest.mock import patch + +import numpy as np +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), + ("get_device_capability", lambda *a, **k: (9, 0))]: + patch(f"torch.cuda.{fn}", val).start() +patch("torch.cuda.get_device_properties", lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() + +MODEL = os.environ.get("MODEL", "70b") +SEQLEN = int(os.environ.get("SEQLEN", "2048")) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) +LP_LB = float(os.environ.get("LP_LB", "0")) +ILS_S = float(os.environ.get("ILS_S", "180")) +ws = 1 +for d in MESH_SHAPE: + ws *= d +names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] +torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) +ndim = mesh.ndim +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] +_CFG = { + "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), + "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), + 
"70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), +} + + +def model_fn(): + args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size, + max_seq_len=SEQLEN, **_CFG[MODEL]) + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) +print(f"### ILS MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ils_s={ILS_S} ###", flush=True) + +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") +autop.__enter__() +x = (Shard(0),) + (Replicate(),) * (ndim - 1) +out = (Shard(0), Shard(2)) if ndim == 2 else x +autop.add_parameter_memory_constraint(low=None, high=None) +autop.add_input_constraints([x]) +autop.add_output_constraints([out]) +opt = autop.sharding_optimizer +print(f"[build] lite_build={time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True) + + +def gap(o): + return 100 * (o - LP_LB) / LP_LB if LP_LB else float("nan") + + +s = ApproximateShardingSolver(opt) +s._build_problem() +s._build_factors() +G = len(s.groups) +domains = [g.domain for g in s.groups] +multi = [d for d in domains if d > 1] +edges = len(s.C) +print(f"[graph] groups={G} multi_choice_groups={len(multi)} " + f"max_domain={max(domains)} sum_domain={sum(domains)} pair_edges={edges}", flush=True) + +# Stock solve (BP + local search), mirrors _solve's BP candidate. 
+deadline = time.perf_counter() + 1e9 +s._belief_propagation() +s._memory_repair() +s._coordinate_descent(deadline) +s._star_block_search(deadline) +stock = s._fast_total_energy() +best = stock +best_snap = [g.current for g in s.groups] +print(f"[stock] bp+cd+star energy={stock:.1f} gap={gap(stock):+.2f}%", flush=True) + +# Iterated local search: perturb k random multi-choice groups, re-optimize, keep best. +rng = random.Random(0) +multi_gids = [g for g in range(G) if s.groups[g].domain > 1] +t0 = time.perf_counter() +iters = 0 +accepts = 0 +while time.perf_counter() - t0 < ILS_S: + iters += 1 + # restore best, then kick + for gid, ci in enumerate(best_snap): + s._set_group(gid, ci) + k = rng.randint(1, max(2, len(multi_gids) // 10)) + for gid in rng.sample(multi_gids, min(k, len(multi_gids))): + s._set_group(gid, rng.randrange(s.groups[gid].domain)) + s._memory_repair() + s._coordinate_descent(deadline) + s._star_block_search(deadline) + e = s._fast_total_energy() + if e < best - 1e-6: + best = e + best_snap = [g.current for g in s.groups] + accepts += 1 + print(f"[ils] iter={iters} NEW BEST energy={best:.1f} gap={gap(best):+.2f}% " + f"(k={k})", flush=True) + +for gid, ci in enumerate(best_snap): + s._set_group(gid, ci) +exact = s._write_back() +print(f"[ILS done] iters={iters} accepts={accepts} stock={stock:.1f} " + f"best={best:.1f} exact_obj={exact:.1f} gap={gap(exact):+.2f}% " + f"(improvement vs stock = {100*(stock-best)/stock:.2f}%)", flush=True) diff --git a/examples/_bench_approx_sweep.py b/examples/_bench_approx_sweep.py new file mode 100644 index 00000000..3d73a070 --- /dev/null +++ b/examples/_bench_approx_sweep.py @@ -0,0 +1,106 @@ +"""Build one model (lite) once, then run ApproximateShardingSolver under several +hyperparameter configs to see whether the objective gap (vs a known LP lower +bound) is closable by tuning (candidate pruning / BP iters / time / local search) +or is structural. 
Env: MODEL, MESH, SEQLEN, LP_LB (reference lower bound).""" +import logging +import os +import time +from unittest.mock import patch + +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), + ("get_device_capability", lambda *a, **k: (9, 0))]: + patch(f"torch.cuda.{fn}", val).start() +patch("torch.cuda.get_device_properties", lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() + +MODEL = os.environ.get("MODEL", "70b") +SEQLEN = int(os.environ.get("SEQLEN", "2048")) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) +LP_LB = float(os.environ.get("LP_LB", "0")) +ws = 1 +for d in MESH_SHAPE: + ws *= d +names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] +torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) +ndim = mesh.ndim +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] +_CFG = { + "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), + "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), + "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, 
ffn_dim_multiplier=1.3, multiple_of=1024), + "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), +} + + +def model_fn(): + args = TransformerModelArgs( + rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL]) + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) +print(f"### approx sweep MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ###", flush=True) + +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") +autop.__enter__() +x = (Shard(0),) + (Replicate(),) * (ndim - 1) +out = (Shard(0), Shard(2)) if ndim == 2 else x +autop.add_parameter_memory_constraint(low=None, high=None) +autop.add_input_constraints([x]) +autop.add_output_constraints([out]) +opt = autop.sharding_optimizer +print(f"[build] lite_build={time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True) + +CONFIGS = [ + ("default", dict()), + ("cand=256", dict(candidate_limit=256)), + ("cand=None", dict(candidate_limit=None)), + ("bp=100", dict(bp_iters=100)), + ("sweeps=200,star=20,t=600", dict(max_sweeps=200, star_passes=20, max_time_s=600)), + ("star_children=64,domain=4096", dict(max_star_children=64, group_domain_limit=4096)), + ("ALL generous", dict(candidate_limit=None, bp_iters=100, max_sweeps=200, + star_passes=20, max_time_s=900, max_star_children=64, + group_domain_limit=4096)), +] + +best = None +for name, cfg in CONFIGS: + t = time.perf_counter() + solver = ApproximateShardingSolver(opt, **cfg) + solver.get_solution(verbose=False) + dt = time.perf_counter() - t + ap = opt.profile["approximate"] + obj = ap["objective"] + gap = 100 * (obj - LP_LB) / LP_LB if LP_LB else float("nan") + winner = 
"bp" if ap["bp_energy"] <= ap["greedy_energy"] else "greedy" + print(f"[cfg] {name:30s} obj={obj:.1f} gap={gap:+.2f}% " + f"bp={ap['bp_energy']:.1f} greedy={ap['greedy_energy']:.1f} win={winner} " + f"t={dt:.1f}s", flush=True) + if best is None or obj < best[1]: + best = (name, obj) + +print(f"[BEST] {best[0]} obj={best[1]:.1f} " + f"gap={100*(best[1]-LP_LB)/LP_LB:+.2f}%" if LP_LB else f"[BEST] {best[0]} obj={best[1]:.1f}", + flush=True) diff --git a/examples/_bench_build_profile.py b/examples/_bench_build_profile.py index 82b31bd6..03b6a2c9 100644 --- a/examples/_bench_build_profile.py +++ b/examples/_bench_build_profile.py @@ -18,6 +18,13 @@ from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config logging.basicConfig(level=logging.ERROR) +if os.environ.get("DEBUG_CLUSTER") == "1": + h = logging.StreamHandler() + h.setLevel(logging.DEBUG) + for nm in ("autoparallel.graph_passes.graph_clustering", "autoparallel.optimize_sharding"): + lg = logging.getLogger(nm) + lg.setLevel(logging.DEBUG) + lg.addHandler(h) for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), ("get_device_capability", lambda *a, **k: (9, 0))]: patch(f"torch.cuda.{fn}", val).start() @@ -25,8 +32,15 @@ "P", (), {"major": 9, "minor": 0, "name": "H100", "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() +MODEL = os.environ.get("MODEL", "1b") SEQLEN = int(os.environ.get("SEQLEN", "2048")) MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) +_CFG = { + "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), + "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), + "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), + "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), +} ws = 1 for d in MESH_SHAPE: ws *= d 
@@ -40,8 +54,7 @@ def model_fn(): args = TransformerModelArgs( - dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, - multiple_of=256, rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN) + rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL]) with torch.device("meta"): return Transformer(args) @@ -52,7 +65,7 @@ def input_fn(): set_nccl_topo_config(detect_nccl_topo_config(mesh)) mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -print(f"=== build profile: mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===", flush=True) +print(f"=== build profile: MODEL={MODEL} mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===", flush=True) t = time.perf_counter() autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") diff --git a/examples/_bench_lp_integrality.py b/examples/_bench_lp_integrality.py new file mode 100644 index 00000000..1c95b7e1 --- /dev/null +++ b/examples/_bench_lp_integrality.py @@ -0,0 +1,118 @@ +"""Re-solve the 70B LP relaxation and report how integral the optimum is: count +fractional variables in the HiGHS solution. If ~all variables are 0/1, the LP +optimum is reachable by integers (so an approx gap is a real solver failure); if +many are fractional, the LP bound is loose (and the approx may be near-optimal). +Also reports the objective with the memory constraint dropped, to test whether +the memory budget is the fractionality source. 
Env: MODEL, MESH, SEQLEN.""" +import logging +import os +import time +from unittest.mock import patch + +import numpy as np +import pulp +import scipy.sparse as sp +import torch +from scipy.optimize import linprog +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), + ("get_device_capability", lambda *a, **k: (9, 0))]: + patch(f"torch.cuda.{fn}", val).start() +patch("torch.cuda.get_device_properties", lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() + +MODEL = os.environ.get("MODEL", "70b") +SEQLEN = int(os.environ.get("SEQLEN", "2048")) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) +DROP_MEM = os.environ.get("DROP_MEM", "0") == "1" +ws = 1 +for d in MESH_SHAPE: + ws *= d +names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] +torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) +ndim = mesh.ndim +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] +_CFG = { + "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), + "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), + "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, 
ffn_dim_multiplier=1.3, multiple_of=4096), +} + + +def model_fn(): + args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size, + max_seq_len=SEQLEN, **_CFG[MODEL]) + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) +print(f"### LP integrality MODEL={MODEL} mesh={MESH_SHAPE}{names} drop_mem={DROP_MEM} ###", flush=True) + +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp") +autop.__enter__() +x = (Shard(0),) + (Replicate(),) * (ndim - 1) +out = (Shard(0), Shard(2)) if ndim == 2 else x +autop.add_parameter_memory_constraint(low=None, high=None) +autop.add_input_constraints([x]) +autop.add_output_constraints([out]) +opt = autop.sharding_optimizer +print(f"[build] full_build={time.perf_counter()-t:.1f}s", flush=True) + +opt._set_objective() +if not DROP_MEM: + opt._apply_memory_constraint() +variables = opt.prob.variables() +vidx = {id(v): i for i, v in enumerate(variables)} +n = len(variables) +c = np.zeros(n) +for key, dv in opt.decision_vars.items(): + mult = 1 + len(opt._root_to_copies.get(key[0], ())) + c[vidx[id(dv.var)]] += dv.cost * mult +re = ru = 0 +reqr, reqc, reqd, beq = [], [], [], [] +rubr, rubc, rubd, bub = [], [], [], [] +for con in opt.prob.constraints.values(): + rhs = -con.constant + if con.sense == pulp.LpConstraintEQ: + for v, co in con.items(): + reqr.append(re); reqc.append(vidx[id(v)]); reqd.append(co) + beq.append(rhs); re += 1 + else: + sgn = 1.0 if con.sense == pulp.LpConstraintLE else -1.0 + for v, co in con.items(): + rubr.append(ru); rubc.append(vidx[id(v)]); rubd.append(sgn * co) + bub.append(sgn * rhs); ru += 1 +A_eq = sp.csr_matrix((reqd, (reqr, reqc)), shape=(re, n)) if re else None +A_ub = sp.csr_matrix((rubd, (rubr, 
rubc)), shape=(ru, n)) if ru else None +t = time.perf_counter() +res = linprog(c, A_ub=A_ub, b_ub=(bub or None), A_eq=A_eq, b_eq=(beq or None), + bounds=(0, 1), method="highs-ds", options={"disp": True}) +print(f"[lp] solve={time.perf_counter()-t:.1f}s status={res.message}", flush=True) +xv = res.x +freq = np.abs(xv - np.round(xv)) +nfrac = int((freq > 1e-6).sum()) +nfrac4 = int((freq > 1e-4).sum()) +# weight fractionality by objective contribution to see if it matters +frac_obj = float(np.abs(c * freq).sum()) +print(f"[RESULT] MODEL={MODEL} drop_mem={DROP_MEM} obj={res.fun:.1f} " + f"vars={n} fractional(>1e-6)={nfrac} ({100*nfrac/n:.4f}%) " + f"fractional(>1e-4)={nfrac4} frac_obj_weight={frac_obj:.1f}", flush=True) diff --git a/examples/_bench_trws.py b/examples/_bench_trws.py new file mode 100644 index 00000000..4e4fbc2d --- /dev/null +++ b/examples/_bench_trws.py @@ -0,0 +1,173 @@ +"""Prototype TRW-S (tree-reweighted sequential message passing) on the approx +solver's faithful factor graph, validated against the CBC-exact optimum. If TRW-S +(optionally + the existing local search) reaches the optimum where plain min-sum +BP does not, it is the fix. 
Env: MODEL, MESH, SEQLEN, ITERS.""" +import logging +import os +import time +from unittest.mock import patch + +import numpy as np +import pulp +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) +for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), + ("get_device_capability", lambda *a, **k: (9, 0))]: + patch(f"torch.cuda.{fn}", val).start() +patch("torch.cuda.get_device_properties", lambda *a, **k: type( + "P", (), {"major": 9, "minor": 0, "name": "H100", + "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() + +MODEL = os.environ.get("MODEL", "1b") +SEQLEN = int(os.environ.get("SEQLEN", "2048")) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(",")) +ITERS = int(os.environ.get("ITERS", "1000")) +USE_CBC = os.environ.get("CBC", "1") == "1" +ws = 1 +for d in MESH_SHAPE: + ws *= d +names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] +torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) +mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) +ndim = mesh.ndim +vocab_size = 128256 +batch_size = 2 * mesh.shape[0] +_CFG = { + "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), + "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), + "8b": 
dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), + "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), +} + + +def model_fn(): + args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size, + max_seq_len=SEQLEN, **_CFG[MODEL]) + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") + + +def constrain(autop): + x = (Shard(0),) + (Replicate(),) * (ndim - 1) + out = (Shard(0), Shard(2)) if ndim == 2 else x + autop.add_parameter_memory_constraint(low=None, high=None) + autop.add_input_constraints([x]) + autop.add_output_constraints([out]) + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) +print(f"### TRW-S MODEL={MODEL} mesh={MESH_SHAPE}{names} iters={ITERS} ###", flush=True) + +backend = "ilp" if USE_CBC else "approx" +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver=backend) +autop.__enter__() +constrain(autop) +opt = autop.sharding_optimizer +print(f"[build] {time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True) + +obj_cbc = None +if USE_CBC: + opt._set_objective() + opt._apply_memory_constraint() + t = time.perf_counter() + opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, options=["preprocess off"])) + obj_cbc = pulp.value(opt.prob.objective) + print(f"[cbc] obj={obj_cbc:.1f} status={pulp.LpStatus[opt.prob.status]} " + f"({time.perf_counter()-t:.1f}s)", flush=True) + + +_REF = obj_cbc if obj_cbc else float(os.environ.get("LP_LB", "0")) or None + + +def gap(o): + return 100 * (o - _REF) / _REF if _REF else float("nan") + + +# Stock approx (BP + local search) for comparison. 
+a0 = ApproximateShardingSolver(opt) +t = time.perf_counter() +a0.get_solution(verbose=False) +print(f"[stock approx] obj={opt.profile['approximate']['objective']:.1f} " + f"gap={gap(opt.profile['approximate']['objective']):+.2f}% ({time.perf_counter()-t:.1f}s)", flush=True) + +# Build a fresh factor graph for TRW-S. +A = ApproximateShardingSolver(opt) +A._build_problem() +A._build_factors() +G = len(A.groups) +nbrs = A.nbrs +unary = A.g_unary +order = sorted(range(G), key=lambda g: min(A.groups[g].members)) +pos = [0] * G +for i, g in enumerate(order): + pos[g] = i +gamma = [] +for g in range(G): + indeg = sum(1 for h in nbrs[g] if pos[h] < pos[g]) + outdeg = sum(1 for h in nbrs[g] if pos[h] > pos[g]) + gamma.append(1.0 / max(1, max(indeg, outdeg))) + +msg = {} +for g in range(G): + for h in nbrs[g]: + msg[(g, h)] = np.zeros(len(unary[h])) + +t = time.perf_counter() +best = float("inf") +best_snap = None +for it in range(ITERS): + for forward in (True, False): + seq = order if forward else order[::-1] + for p in seq: + if not nbrs[p]: + continue + agg = unary[p].copy() + for r in nbrs[p]: + agg += msg[(r, p)] + wp = gamma[p] * agg + for q in nbrs[p]: + if (pos[q] > pos[p]) != forward: + continue + P = A._pair_matrix(p, q) # (D_p, D_q) + mm = (wp - msg[(q, p)])[:, None] + P + mq = mm.min(axis=0) + mq -= mq.min() + msg[(p, q)] = mq + A._decode(msg) + e = A._fast_total_energy() + if e < best - 1e-6: + best = e + best_snap = [g.current for g in A.groups] + if it < 5 or it % 50 == 0: + print(f" [trws it={it}] decode_energy={e:.1f} best={best:.1f} gap={gap(best):+.2f}%", flush=True) +trws_s = time.perf_counter() - t +for gid, ci in enumerate(best_snap): + A._set_group(gid, ci) +print(f"[TRW-S] best={best:.1f} gap={gap(best):+.2f}% ({trws_s:.1f}s, {ITERS} iters)", flush=True) + +# Polish TRW-S result with the existing local search. 
+deadline = time.perf_counter() + 60 +A._memory_repair() +A._coordinate_descent(deadline) +A._star_block_search(deadline) +polished = A._fast_total_energy() +print(f"[TRW-S + local search] obj={polished:.1f} gap={gap(polished):+.2f}%", flush=True) +print(f"[RESULT] MODEL={MODEL} mesh={MESH_SHAPE} cbc={obj_cbc} " + f"stock_gap={gap(opt.profile['approximate']['objective']):+.2f}% " + f"trws_gap={gap(best):+.2f}% trws_ls_gap={gap(polished):+.2f}%", flush=True) From ba02ea52b73688f997b88267b76dd807947bfa1e Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sun, 31 May 2026 23:17:02 -0700 Subject: [PATCH 21/27] Add real-GPU LLaMA3 training sanity check; drop stale loss-curve symlinks examples/_sanity_llama3.py traces LLaMA3, selects a strategy with the approximate (TRW-S) solver, applies it as DTensor, and trains a fixed random batch for a few steps on real GPUs over a 2D or 3D mesh, verifying the loss curve descends. Also removes three dangling loss-curve symlinks left over from an earlier run. Authored with Claude. --- examples/_sanity_llama3.py | 215 +++++++++++++++++++ qwen3_8b_autoparallel_30steps_loss_curve.png | 1 - qwen3_8b_autoparallel_30steps_loss_curve.svg | 1 - qwen3_8b_autoparallel_30steps_losses.csv | 1 - 4 files changed, 215 insertions(+), 3 deletions(-) create mode 100644 examples/_sanity_llama3.py delete mode 120000 qwen3_8b_autoparallel_30steps_loss_curve.png delete mode 120000 qwen3_8b_autoparallel_30steps_loss_curve.svg delete mode 120000 qwen3_8b_autoparallel_30steps_losses.csv diff --git a/examples/_sanity_llama3.py b/examples/_sanity_llama3.py new file mode 100644 index 00000000..71fc1122 --- /dev/null +++ b/examples/_sanity_llama3.py @@ -0,0 +1,215 @@ +"""Real LLaMA3 AutoParallel training sanity check on a 2D or 3D mesh. + +Traces the model, picks a sharding strategy with the approximate (TRW-S) solver, +applies it as DTensor, and trains a fixed random batch for a few steps on real +GPUs. Pass: the loss curve goes down. 
Adapted from example_sanity_check_qwen3.py. + +The batch is data-parallel on the `dp` axis only; any other axes (`cp`, `tp`) +are model-sharding axes (the solver shards params/activations over them). Logits +are vocab-parallel on `tp` and replicated on `cp`, so the loss is reduced over +the world and normalized by global_token_count * (world_size // dp_degree). + +Run: torchrun --standalone --nproc-per-node N examples/_sanity_llama3.py --mesh 2,2,8 --model 8b +""" +import argparse +import logging +import os +import time + +import torch +import torch.distributed as dist +import torch.distributed.nn.functional as dist_nn_func +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel + +_CFG = { + "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), + "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), +} +_NAMES = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")} + + +def parse_args(): + p = argparse.ArgumentParser(description="LLaMA3 AutoParallel training sanity check.") + p.add_argument("--model", type=str, default="1b", choices=list(_CFG)) + p.add_argument("--mesh", type=str, default="2,2", help="comma-separated mesh dims") + p.add_argument("--global-batch-size", type=int, default=8) + p.add_argument("--microbatch-size", type=int, default=2) + p.add_argument("--seq-len", type=int, default=512) + p.add_argument("--train-steps", type=int, default=10) + p.add_argument("--lr", type=float, default=1e-3) + p.add_argument("--max-grad-norm", type=float, default=1.0) + p.add_argument("--seed", type=int, default=0) + p.add_argument("--solver", type=str, default="approx") + p.add_argument("--verbose", action="store_true") + return p.parse_args() + + +def init_distributed(args): + if 
"WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ: + raise RuntimeError("Run with torchrun --standalone --nproc-per-node N ...") + world_size = int(os.environ["WORLD_SIZE"]) + local_rank = int(os.environ["LOCAL_RANK"]) + dims = tuple(int(x) for x in args.mesh.split(",")) + prod = 1 + for d in dims: + prod *= d + if prod != world_size: + raise ValueError(f"WORLD_SIZE {world_size} != prod(mesh) {prod}") + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + dist.init_process_group("nccl", device_id=device) + mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", dims, mesh_dim_names=_NAMES[len(dims)] + ) + return device, mesh + + +def placement_for(name, *, is_output): + if name == "dp": + return Shard(0) + if name == "tp" and is_output: + return Shard(2) + return Replicate() + + +def make_local_tokens(args, mesh, device, vocab_size): + names = mesh.mesh_dim_names + dp_rank = mesh.get_coordinate()[names.index("dp")] + dp_degree = mesh["dp"].size() + local_batch_size = args.global_batch_size // dp_degree + gen = torch.Generator(device="cpu") + gen.manual_seed(args.seed) + tokens = torch.randint( + 0, vocab_size, (args.global_batch_size, args.seq_len + 1), + generator=gen, dtype=torch.long, + ) + start = dp_rank * local_batch_size + return tokens[start:start + local_batch_size].to(device, non_blocking=True) + + +def vocab_parallel_cross_entropy(logits, labels, *, vocab_size, tp_group, tp_rank, + tp_degree, normalizer): + local_vocab_size = logits.shape[-1] + vocab_start = tp_rank * local_vocab_size + vocab_stop = vocab_size if tp_rank == tp_degree - 1 else vocab_start + local_vocab_size + logits = logits.float() + local_max = logits.amax(dim=-1) + with torch.no_grad(): + global_max = local_max.detach().clone() + dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=tp_group) + shifted = logits - global_max.unsqueeze(-1) + global_exp_sum = dist_nn_func.all_reduce( + shifted.exp().sum(dim=-1), op=dist.ReduceOp.SUM, 
group=tp_group) + mask = (labels >= vocab_start) & (labels < vocab_stop) + local_target = torch.zeros_like(labels, dtype=torch.long) + local_target[mask] = labels[mask] - vocab_start + local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1) + local_target_logits = local_target_logits * mask.to(logits.dtype) + target_logits = dist_nn_func.all_reduce( + local_target_logits, op=dist.ReduceOp.SUM, group=tp_group) + loss_sum = (global_exp_sum.log() + global_max - target_logits).sum() + return loss_sum / normalizer + + +def print_rank0(msg): + if dist.get_rank() == 0: + print(msg, flush=True) + + +def main(): + args = parse_args() + logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING) + device, mesh = init_distributed(args) + names = mesh.mesh_dim_names + world_size = dist.get_world_size() + tp_group = mesh.get_group("tp") + tp_rank = mesh.get_local_rank("tp") + tp_degree = mesh["tp"].size() + dp_degree = mesh["dp"].size() + local_batch_size = args.global_batch_size // dp_degree + grad_accum = local_batch_size // args.microbatch_size + # logits are distinct only across dp (cp/tp replicate the per-token loss), + # so the world all-reduce over-counts by world_size // dp_degree. 
+ normalizer = args.global_batch_size * args.seq_len * (world_size // dp_degree) + + torch.manual_seed(args.seed) + model_args = TransformerModelArgs( + rope_theta=500000, vocab_size=128256, max_seq_len=args.seq_len, **_CFG[args.model], + ) + trace_global_batch = args.microbatch_size * dp_degree + + with torch.device("meta"): + model = Transformer(model_args) + + def input_fn(): + return torch.randint(0, model_args.vocab_size, + (trace_global_batch, args.seq_len), device=device) + + mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) + x_sharding = tuple(placement_for(n, is_output=False) for n in names) + out_sharding = tuple(placement_for(n, is_output=True) for n in names) + print_rank0(f"LLaMA3-{args.model} sanity: mesh={tuple(mesh.shape)}{names} " + f"solver={args.solver} in={x_sharding} out={out_sharding} " + f"global_batch={args.global_batch_size} microbatch={args.microbatch_size} " + f"grad_accum={grad_accum} seq_len={args.seq_len} steps={args.train_steps} lr={args.lr}") + + t0 = time.time() + with AutoParallel(model, input_fn, mesh, mp_policy, repeated_subgraphs=True, + solver=args.solver) as autop: + autop.add_parameter_memory_constraint(low=None, high=None) + autop.add_input_constraints([x_sharding]) + autop.add_output_constraints([out_sharding]) + sharding_placement = autop.optimize_placement(verbose=args.verbose) + parallel_mod = autop.apply_placement(sharding_placement) + print_rank0(f"trace+optimize+apply took {time.time() - t0:.1f}s") + + parallel_mod.to_empty(device=device) + parallel_mod.init_weights(buffer_device=device) + + batch = make_local_tokens(args, mesh, device, model_args.vocab_size) + inputs = batch[:, :-1].contiguous() + labels = batch[:, 1:].contiguous() + input_mbs = inputs.split(args.microbatch_size, dim=0) + label_mbs = labels.split(args.microbatch_size, dim=0) + optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr) + + try: + losses = [] + for step in range(args.train_steps): + 
optimizer.zero_grad(set_to_none=True) + step_loss = torch.zeros((), device=device) + for mi, ml in zip(input_mbs, label_mbs): + logits = parallel_mod(mi) + if torch.any(torch.isnan(logits)): + raise RuntimeError("NaN in forward output") + loss = vocab_parallel_cross_entropy( + logits, ml, vocab_size=model_args.vocab_size, tp_group=tp_group, + tp_rank=tp_rank, tp_degree=tp_degree, normalizer=normalizer) + if torch.any(torch.isnan(loss)): + raise RuntimeError("NaN in loss") + loss.backward() + step_loss = step_loss + loss.detach() + torch.nn.utils.clip_grad_norm_(parallel_mod.parameters(), args.max_grad_norm) + optimizer.step() + with torch.no_grad(): + logged = step_loss.clone() + dist.all_reduce(logged, op=dist.ReduceOp.SUM) + losses.append(float(logged.item())) + print_rank0(f"step={step:03d} loss={losses[-1]:.6f}") + + print_rank0(f"\nloss curve: {[round(x, 4) for x in losses]}") + verdict = "PASS" if losses[-1] < losses[0] else "FAIL" + print_rank0(f"SANITY {verdict}: loss {losses[0]:.4f} -> {losses[-1]:.4f}") + dist.barrier(device_ids=[device.index]) + torch.cuda.synchronize(device) + finally: + if dist.is_initialized(): + dist.destroy_process_group() + + +if __name__ == "__main__": + main() diff --git a/qwen3_8b_autoparallel_30steps_loss_curve.png b/qwen3_8b_autoparallel_30steps_loss_curve.png deleted file mode 120000 index c8413f8d..00000000 --- a/qwen3_8b_autoparallel_30steps_loss_curve.png +++ /dev/null @@ -1 +0,0 @@ -/tmp/qwen3_8b_autoparallel_30steps_loss_curve.png \ No newline at end of file diff --git a/qwen3_8b_autoparallel_30steps_loss_curve.svg b/qwen3_8b_autoparallel_30steps_loss_curve.svg deleted file mode 120000 index babd3d4e..00000000 --- a/qwen3_8b_autoparallel_30steps_loss_curve.svg +++ /dev/null @@ -1 +0,0 @@ -/tmp/qwen3_8b_autoparallel_30steps_loss_curve.svg \ No newline at end of file diff --git a/qwen3_8b_autoparallel_30steps_losses.csv b/qwen3_8b_autoparallel_30steps_losses.csv deleted file mode 120000 index 47d30691..00000000 --- 
a/qwen3_8b_autoparallel_30steps_losses.csv +++ /dev/null @@ -1 +0,0 @@ -/tmp/qwen3_8b_autoparallel_30steps_losses.csv \ No newline at end of file From cbd95757b785e94918ef2869badcf48de990c7a1 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Mon, 1 Jun 2026 10:14:17 -0700 Subject: [PATCH 22/27] Drop committed loss-curve/profiling artifacts; ignore png/svg/csv Authored with Claude. --- .gitignore | 3 ++ qwen3_moe_mast_20steps_loss_curve.png | Bin 19666 -> 0 bytes qwen3_moe_mast_20steps_loss_curve.svg | 68 -------------------------- qwen3_moe_mast_20steps_losses.csv | 21 -------- 4 files changed, 3 insertions(+), 89 deletions(-) delete mode 100644 qwen3_moe_mast_20steps_loss_curve.png delete mode 100644 qwen3_moe_mast_20steps_loss_curve.svg delete mode 100644 qwen3_moe_mast_20steps_losses.csv diff --git a/.gitignore b/.gitignore index 1a6228f1..4936ecca 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,9 @@ *.egg-info/ *.pdf +*.png +*.svg +*.csv build/ dist/ diff --git a/qwen3_moe_mast_20steps_loss_curve.png b/qwen3_moe_mast_20steps_loss_curve.png deleted file mode 100644 index 8b4d9c43f227e00009f42077c6b257e19591586a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 19666 zcmch<1z1&GyEZxj2|+@60hJO1q(!6~MU)nh?vzHlK><;bl5Ujl?pPq*5{piyVbL9n zf2`%Zzx{pR-us;YKj&P>%j;sU8Doxkp8L6<81s{YoFpOsZF~rV2w%QBa_P3-zwq3m-xUUCKfAl ztLugl-rbKf<9HvhVW^iFZ?U>7b+P5LDwt6GJUIKk{qg1PuiU+hUq|0r*LOQWXGA0MO27|=2milaa(DW@ zWTh*cvR|mVC5GRC2L;-z)IhuizLLQ<6E*9@v3ta{Lcu|S-BIvTy>}k($hym|;QJTi z*pUT&*RB)OAg1i~`jO3*TQlC^73o?6v$7s^k8aF3AGh`XT#30k+7l}zC}?NY zf2i}$1DjxJYdk;IS&p6VnORg4|U!3Ct%bh zPWkIS%U>kOxp^L}56Qfqa_-B&8RTh2T-^@O(`U(%|Lw2sqPwZ-1TGFzzRzku5TppFYnKzH->xbeF;vEn3ri&c6 zeRRxdwcV7N2e+;CPG1yGR~@)U7t#uTi8yL$X=`n16Y}`Wb#a1_?pf|qP&yDl?vEpw zYMRHs0lgCrQHput;YuL|)I?hv-Uk~^LAm5Ag^I86!XhHdR{CKLYn;4{onsN-5H5wH z4Z=dTdAb~z3r(y|nTG{x`?-P#2OB4W7*C(4|Kb8KoT{)-K1?FZ<*D;HVM4&yChN_N z-)z51^lr;E^gl^W;$L^>YehaWQ0Q2_SXb@oPwdvjyKcAWHkyQ`6h{};RqJ**_vK|O 
zr;&?&XWV1^q`yjKq7xG8oPMh$CaxoA%(v^Zk#QP18ZXxx2xVx^6crTQS;a&@S*+xfzMu|Jwei9TBKfv_5{X^G%Kxy9Bu>_Al})r?!@y@`W`&XteT zKf5pQ+d114$j^F1y^8jHZD%m#sObf8VsG!rs+wUmtD`+T&NR1mo+{V&F=L* zlotAz@7aiWo=)ZMs@S+tM!?ob?e~w(c9C&#tl5+K?BlqM^a*8eN<91LZogHs>Janu z1$+F~1qHLjYlbT0(uHV zcW<1>if%^R`sj{0-d2JR1M%%M@5E{u81GcIyEpO|*(H^LqX^46eEnE!hHr1VTPZ_% z=d|2HIY;3+@}BRf01TBzXUaVD_D!r^r~I{d6IT@#(-lwhl9C1w0fxyZ*WdBm?puAG z_vX6U@&Kvnf^W!0{3lxB&vj&O%d=Q=-MGpGd3UT5U@p~SLx&==F-|~28RU~6+@}}z zzV3Lw5PMxj*8I?Hc^j9QP=%6`aeaxOhrhCitde&A3q<9+ zNA_vA5@wM2F`V9=*_;$TOoQg%4GlCrM=nNZk$Cm?@<{sZp{iba(hO=r50+m|smJJl zK&NO#Tusg+bSa-$vG>1>e+U=Z$aV>XuXB5!8W;Kogh)uX12j$)^3Lv0p?w|{Z?4jm z!Fln^_N1%4ja2}-v#hIHIQ!*Oko1cecp)yfyZa}z6s(u~-ASNnuJuQa{CFirIrbFq z7ZCIyKDb4Cbr&{2QILU9iTG1>i|4gFYQoZ)LCkz4f zCc>$~i~4fi#yjmvBQQVr+T{*!@rt{xY-|Xwo!M}jJlVp{|81I|n2_K{Ldk98eY$Z{ zPFWdDCBW!sYwu_^xoddwyO~ySWB;{xw*EcILrdu;Sro*7h$O&YIJz$WOtO7{jc0#w zG1mmH7v&s$`eB4Jv(C;ayb?57V&cAc83pFH_UMNqe=89F_zHxu?QzWh&mm6f@aG4z zg|_R#^&G`-2c3r62on+#gsrDU-Hvw`b9*O`(u^I*3snp@E8n%`g+(!)ysPtE-n@uP zcFdhgkw_L3VNol2%zK|2rr&eWU#!Ib{X0N-yVaJYpL40eFj0uV!94fAIpICDCL%{Y z;f9!Mx>q}TbXiloe~Tc*WqZia%p~zH8CYQVu=q-A;)}Nl`#3%u<0Wi~BEQ$heWksU z8{DQ%%=kx2dTMxkV(K69^FPPCuFzvMnn5<&M=khPZx2ku2cq*D@oL3e$crSN9k9U1 zAADa6dF?9C^d+(8HW}I(c~d$h3A)l-DHG;B+=^Jz3Q6W?zIYsZx3!ow)1PEufFqg> z7JHs`xE0WobkrW6X`^Gh5Q;o7S3TOcEqqlzSUGzxKjFH%3nz>xtRtQ3>@*u$NPdEW z2%dZp7*e`@HPs_5vTVBDAD+TGEWyoB{e&|nNEUmlHT7Q(dd)_^~GVL9+ zPv#9%XnL@hvAEx*ZxAN_k!yVx`K>}d{Db(%3^Bs;B)De?Og**KrTm6Hj1Wm3cuYGJSQOg5x%*L@ctl!1zLh`PvznQPeWr5YXX$js%sa=lb5u!ZKz`NO5|lGD+p zS4!E4fPwkSjZfj$UszNa7%0uQm5?rNUgLafc`!K36v=KlqVMBKVx({XHGI~(+xvN* zVs^qg&zn_w)xnVX1Wpr6g~E*UUy)1^k(JJyu9JIr z0F%`^zTnB_baSb4ud01H`RWx%#JZ$YHmNw&6Ul{M_?9Vkv*J{)||&$VGyQsgecuqaj_&Bc*h2KDFcJ&{El( z(Q->T@rBgu=!VVRH4=8+2DihhFAWP%kVWyxy8&f0c1N##Cw85Fe{hr)30nm!wYU4Chvb{h0kcln~&eU^@{CB0j9(rKgE zW?gqTefAkw?rX2vqk3TC(}MBno<(*o29mMfy@abp#l+mF4X1nhVQR)^wq|}~BZ1SD z$n0(UXyI>QS6S5VBqOT_OX6xirS`?a_r~{lbgdI%$>t(G<}0Q5Bco$t9M1ZtY1`$- 
zy;t6Nc$}BKUJnlsZ%6XBMBE*<9Ajc%9Lehp*hQKIQGPg|Oe}F~j`FInO}BQ4%2$bB zoyEzfFWk(5!+>nlv3f@lles!|^^A>1|mn4mVv`%>o|Q>bW8Z8#lL#cV-`LlWZ(|4{$)i0@|B?iJq|cCkaUw zyl`J<8)6RYKWK%mv%cWJ5a2W2Po3F`TZ+>E?i^fGv<0*{+_ZlIk(PG<Q|uXEk*Cd43CE{1F!)_C`(!)ST$W$tllVulJYh5YR zkt3X^t2r~{y^>XTKYU~5(-8ajnRBnMudPpOB;KdFMM~`+_RJzfHSm~TKU6B~I2*ju zxak?dhP}?+VbT-R#9%{XHrWe5IQ4Hn4ZIA%p`EU@D#s33P2fm9)D_>Fk>3%Ld0u8s zbKm*OQh#)%4&pWr@eqxkfx%#NN9B;&)vH%SXklhjeG#3l$N8m$eH~fM7LISH8a%fU zhtp?v%r9cU-lm9lmN^4hOm}%I^Fk)V^r*5x^ZOolz3qnVPeISa_jTX>uLInr^saPd zYRF)GZ9z^7BlKH*o^y3OS@`X+JN90;I}5+`8;qAPUuO2zRkNq425&Uuc-`~1$+Ye@ zvFO^`Tgv3(<2xBhtrW?hzfR05K+W6(_P<#%lV9-LjrjljWN$VbR4hvMFlW;vd)&Ay{EdP1)M2Y{L8CQ5ey+y zalB-~CD!`;JKMH4aKRT7n?DQE!zw#R3zf8IW@-!z>*@_d4jIzEA`MKVjE#m_n}SyU%s^Y{c$CW876>E6B3|XjqY8)TIqAQCg))Xq>!sW2dX1cBUiU>JEyB+6fjMR{pSBuk zY4+KF9_nA-8}me?!SSpmyOg2-nDxA=86Xq87XIq~>r3EzNXbXCYJXQfVlNZ<3Fx`_ z$>3mx!&Nq=oT}j()oKGiI=?CE7!Jb^Op#sPv}3@FboKY66ZvvsF`Xe97NXnW;Rwjq zZ-#!EKcu{EchDOzSK{Wz zjeHbo^ZWSi7{uuB0Z;c0Yj$&3k6~Spm&h~e(Xrj`*a3tPaA5Bw-lr3>V4RnQvf3XT z|5^&>w@K0xs;zfbI(+w3GMYtq5AeX!W%k~LQiTci8owpr#s(#DaB!>wQ*}|kNl2;C zBd;Oe@ez!h=VB5ol%ewIOoN)}E9Qyk0>yZVr!$a*?3;<{IkKM^0iaTvd3gM=A-KGj zx3_bYO1~lEu)hLfwK;rWzPQ&}`xgGb5Jv5_*5eO?L#w3}V6H`%V6BR}6$X5SaizAdCZWXRh0c20zJ*Iq&f`}Lzcc%joL5eR4olTbTx4z? 
zJ-sSh93iaLz#djX8$kGTwQo4;^Q$AEw@u9*krv$ebRR;Jcs=%?on%9ocVU21 zeHUF_jV~9MuQNt6OcpQuOl5z+@TRnS%@lcQF-=eROvZAmBJoVa)C# zqM0{1J4j{`U7(hGN!R+B{Fac1-f2Vv{ZpL%&7TL9t0HBo?egK9Ix5u$<61L1%j`NT zHT;G%N1j{Dv^d1Kh3Od?SHXH&!B*%HTT?qj>C7W9wmhFyJ{|1{DN?5h0OPWmE*_mT zcK{Cky?}k5KmL}2sBm(f1v0@I2%fB_bE2^OJQ!Kl%A4wZbA?$Np-TY@=G=_(xI7Z^ zwTC3)SB}q&`=efa>DD>%J!SSCL;8%R$Y#rregbwuE9kMmakAGF`*?ZxkZJA6D}U{^u-S;3&U*&!NMz4pT>n1mJ6^F%;9c2WayJk^}Xb=QG+f4s2D zN$kwy4f`P$wB|jMaUxJ_=uZOd<9LZ1IipsV(*YA=vLbXj_RCMCHf4-rUG6#k%vCy- z!pW&$ze8Adxnw2E&BAKJKm;>(d#GrxTh+Hv3xt35l^uElvsSL%N&pd6auBo8ztG6n ze8RCPZRq)$DO$JOsBc(zIho&Hn2zq79?}DZF^*qH_fI;m9c&V-XylTMZISix820_( zTxRdA%(5EHdRI57zE-24XmHx9f4=ffjCN$weLK0EZZvJnLsfP3{fQ?gbJSVMQ_U0d zwAZl4(v9>s(2y}iB1rfV^abq(5>+=X#>OcJh*Jh8hES@m@M_%KBr_y%bf z4KU50g!V7y8oQlBb)lgn;u1*Dc*vCfqs#WAl~gytFCv&TRb5(|7jt=fk}B0s9kue_ zg!Nh=Lsd^8Jt{GB2wK0z`S!HVzx$f-Yv3Vm?(djP_h8%FUQvgjJO+Atr^&wh_{y#7 zx13LY&8J?IyH!_TFJ+GT{xZ=biuiAhM0zM04v3NvI_%p{`XpGbpK&_?8HpQWnUFR{Ork&!ZE z01xRstGO}`=sB|9any#hEm^QC-8$bKgb8M+-Sx#k#*9^M&U%d2NQ4hbR%blV;7cio7IGcgF+fbL0FVYz#x}!9lEeZ=tW;95Ks?xB?#e zQsfcZo4}`(**_^4$Gr^vEdaz|rK>;)I+MuCpTx9Jn+gE+BJ-$Dy&E-^9hcxQkkr7w zwpr%#8a!Xb*=Oec384G2Ip$&jS1xW}G8YFUU$%6=O)~7`X-drO_i814@(u>Shp)x$ zPr43}zLL{5p6vh>&bWIgE6tJ;SoWPW>X|@WV?5Tc(5a$deLCRhza@zjA9pyd3w5ZH zuH;??J*qoIB5Du0EGE^J8u}zzg=edeQjoY#n;fF2qPntrom~f3brS^9;`|O587}_( zq4mq~NDQ7n z>vI3=L>bjFOoBX(Q1d+#ITGh&sbFzKAh5f zxG0#!pfU87#0>ir(}}8&0U`WO>zZSg3biA>3(>cQ-3(S0Zzul9yUa$-IX3sG#@uSG z@#g;9bm=#OQ_~vU774QqTdmS`QWTMQm2xgz7o6Wkc2(C_zs}Zbc(&sMAk2|yti&p$ zvQjmD*VvB)MBXf??>>|!CkDPN@ak%uE;2mEK>T1@OWQ&vKcUYMWa|`g;|WS8Xh2-l zoxKP4M*kwyi;Cw#idH^wp*N=N8&Caj8Kr5Uyvsr0F&9i%y-v4tP0UoW8Jj9mH#DpD zLabSETLyjlRPVZe7-pEcHd4Tb2>JXOX*`jzg?lvMZA^KNnqRedU-PlsTHD)?sGpR` zc86SCIHb^TMIcTHpd^X7p`cWR zj!u6e=IX)Ph}#MO%gga<*k!Tnq#Of7CJ|zCg=!nn6F|?LE=qjMN>|&(n!Tws>kPE% zo`ts&mYQ_aAub&^-%ph1h`62KJ)i%*vZ8iEcIq|#6mMsDvHR&I)z*OXN=vdN`N@%W zW6m5RQfRl6Gd))U$56Ef-OI2;T^?*@(!G*vP)eHB~$cVw?TOxNdRV)1bo+UQlxIyZEKpDlHIhv>YoKJxA 
zIHE63&~2h6Ec$7|!3IxRM4N7Hy5mH{g|Sp*mTv85lDojjc!f7vX?jg920Fe*VKx|^b(KD8--^F5=|!(ZZA!=Al$ z;mz8?*AJWZ+c%4_D83{X3nUeNe36^jO|D6BgEahgHoKJvCkf`)dm}!u28rY`Sk2bc(zq?_M6*x0R@IWRgciDn6BE<$(fd{1 zq`F~u@IklJydkb}N6Aabb|uR@_BKxc_l-;-YK?K0Y#sCbvhp&?`W4_8_uB@>e&kC(NyX8msV+OYn>(z94m4Kz^se6qdqAw?i(U?*|0uZ zYI4w+Yf6kyd4`tASRj$%8NIQ5&S7Qsmhc)HZ2^x_n5x>bFMw=(I?OKk*QDDVRW{4P z$z8OO_uMP+jinvexgN2ci-rC0H<6`LDsk~)Rn=^;?7Z~6m`-#*+B;c541ep*%#Z8p zFHnc24+J;@T}uxDVBHhXH^2On)3jH6VQEPt(**v`v*EGgSH)x$sB6weBqZcH5ORyT zm5zqnVn?0g2YxKmkT1*E-;h%McCD?zfehbv>>WW{-xzfWSG%=+KFZqoF#g|X+9~TL z@(&nGwt=~Z6oYO7(+p+Z;4Bfxn-_2?ExXRo)^Td;Jcb>P0YQWBZ;e+ns+dxOdn$Sc zeDl2Q%I%^KsO!{EO?$(^0*+y55GWd{RLN+|zwa|Dx*J3Y?qre(E&rmNrsEslWQ1y+ zl+P`ckeVWm^x@(|GnI@oOB+$C2cUJ)r4=!ibDr)D{>K>>BOazwatGs)+6Cas9ag*& z@rQr?mPx+mWg5!mB`TGg`ji6n^R}4EOL{pR%8;0)HZ6V!49LgxPDjK1BI%uos#jJM zrN0Gqo2HQ3H&_9TVZYD2kd|PA_cuJ#1cuxKucV6r%pp?SJjh|?c7)tO)Z`u}qw?mxz%d!T&1qNg*dPV&qBfN4M9-NbH zAwK7H4a@+w7pYqiJWnbr^NTho1LogNGCPwogzsv@UE$8d;_k(4m1U>bA{8dV+ruODH@Kn5b zD7K=JQMraVn8?rfIxy*rU9x1S6Yqdwe7Qpq6BMffDoilM$X}2uiRU-HzOLv7gf<4u zjGBzpHS2 zr6~4vU>=5K;X_5zfYplsGyy%Md(Sc_x}Ry{$WQVSbcO8I{|;$sYiT*8*}edQS=EAR zqt(ylBRLdBxD%alz3z8^nZ5))p3Gc`uV3m-xZ`(oXDAJr3s+NR<^74E$>)TZBZb>% zlTMomxXns%P`BqSbIQd#;2Q>k_)udo7mxMK=AgZg_rQ z*2mub|DX|rPTeL!rhV!zJBlBjLS!p(%KE_e6-X5T<3dO&Z#R4XHAYh{ z0=7<%@Cm$&057L|CR(jAsB{nK2r;#zgdnEksM>OFZXbSg{8B}}pKjKx_bax~dl$`{ zkSL=Za2=#Z^+@YHFo6;M@ifnS4QNI{Vu}GU9${?zHW?}Qmya$IgPVh4n{9l555GOm zaGAUuVqrn6>?NabX$1tO5s8tDx($3PJ?)HSA%I>Gyl3&*UyNY|ZUpn4e-wA+J@YnK zac{+%01&kVYQBnS5G=Vc>*zm%OgcVAcpVz6j94;0pm>r}O^)^!ePkc;iskP81wj*U zSRp+t{p(@Rli<3(QC?V(j~Gz1A_0YHe##-w0F z9jJ4{sB>31OCV*`eYQs6VFe-jHyDi^00G8AoM-NT@gaZ?{YEgM!{=)}p{Y4upW)*; zP}W~!PiW(o-GY2>gVVhD2o?+5R<>U(P!>VA z4TkL<#@_BV7SZlsT2Kl;c&*PS^^cfDZ0I$Zn?}kObx46ig8-TnxyDV2cS;Bp#}}KA3}rYE#;l*{83v?=nixI(iPF+j zv?&iY2LpRSG4WPKr24Bd#vR0ep$w~XVARiRh#-g_tfddv8&mbfzq&QFp-_Gn7&fG0 zV5<{F>5Czn16Bc&`Xz@Cq1!A%QAppjPo+>OK6phD85sT}0uUCkBmkyJ<{9wpHJ>G5 
z`#7T-MGS_N;*(=R6(2yQR)w5E-(xNU2w*srg_Vc{rTBugIby$Apf|w!_;it&Vu%8p z1nLRf}B` z0K|N#AIBIdO<^x8$yiD16-IH`)15e8e|?uG^a!} zEQpuH9s`jD;$lW7!KU5Be<#&<>4nF|)RRwu6b$&<#^Kmu)u+@Bmx3 zPPEU?C5I=D^Le5G9- zI}l?o2FR4p@KA8+^9&PeC+fS<>Syrm3ub;o=;3>CR0Q2i#8E7JFpXKn3#7Qw0EpkT z%^eZxC~Q4-)cgEz$rLjpFi?CGFs^?tiM><*26iEBN-Tr}4F;ncm7GvlVQ4Lh!0>ru z2g+6-G^4(rdomUuwtWKifgMg;~GgC39sqvLI$ zcO58rn3Mpjcn@`?DG82zQAds`Ly!*<7^^53N+5zj&lPTgr(d8hz6S3@DQb#}!)=HN zH7W%ja4i7U*DbG>P&_49zqtS3;|5kqBz8v^mhKDhfw6^)DfzBKcEh z`_(Os6cmU7=$0$UGla%yP18XL=??X+SpUcpmZ35qHxx42vhedL=$Vr%u%i~_x>>tA z=s_fEZm1~Uk}Ghb&JL_PpaxlvGG(H_S-zO%%qbpTLT zpT%3TiQuXf&bW0w?uj|3Cl|;A1@jogSm^01j(oO69;(*bA?T%5^U9@)!Q2(GN>z{p zPg4w!<5%X(t5-i|Y(xjqh;Z%hgkuSeOjWtoO7~+RB?wfRbm=eFQMgF>O4mY6T>M8{ zufW;SW>mL;DM+%(-sVy(^$Hx6z<{=CPpOOd0V&LExZ{50*4o+o_ zrNKNVJK4e2t)slu8}ANRnu}RlCXoDSZq{~NS&e-_Pd}o_?j?L&ldjX{Frzb0##hLy z_fIZu%1diHbJE;JNL^t7tpt55U~_-joP^=x$b-B=DwIUupMrL6MS@8 z{3q}v4U>}^0CJD>_?+NnK)Upz-+&II)X(P!>L>tpwCjR<6K#bdl#k`3j#8vCk_;1! zAruA_@yYaJW}swVFEAz!dj?gk?}IQfCx3*Nby-7m@)uW8l1m2E2OW|ZrM@QD%k=u1 z9s#PQtgBWyC`SiTATH&*Zw;-%5sAmp9J7|sf`Mg8)2<4YSzzQT9FIr`f`)Cme zEad^e9csk<($fvhR^F^1h#~-v0~PTA^y=yU!G`MLANmE}bOY6-d+O|U=@6C;^bdg+ zejM8?c_?l0G@!;B6GA;~i#Ih$euy4=>KwOT5hqH2s0frwizv}YMBs=Ree{=7se%Bi zPr-rN>L8#1nwM-JW>H4LITRD;&xCq?Bc)8QxHf92bF3LA-|nKGOhMg`iMs!0<1mXQ zE$S%kW48JjdW4TYfFl-EyDmH#Ci-ZEY`ibi`(p$JwLZ5%%Lp`I-h0GF0Q_yBYS}w| z+2bCr_4%6DsQ}9ct=#~9BS#;7a*B$qOuwpO$2u5UItXrt;GB=y*Fl7)uhb_UMIs@V zm)_)2uIg!|sF(ihh70Mzt&i!FDCR z1&bAhVph=VU6#9wGb73V54!K7DPiO7s$^Cc6wpC=0W1UrDBy&02m4D})KN|WaPTQv zON;POZ&!d=OJATTRpFWy&I^JEzLu3UzM$9+ME$# z1Cb*85R5_G(?4V@-wn*s;(&>dveIs!&Oe9T50L}EGoK54+3B^%0oTcSm9G)@W?pKs zd&OfTjiXt!e6HaR5N#~RZu4bTYY0baOzZy=zI+@cyIFdS?ZpoN#e3afJBU98F~xr- zYmNlK$yxP}Qjd}79KrN8F#PRDLPN7T_*A~1g@dD*zx{t8ZhU7abL-@;)TJ)&#;OZ- z6+S3V@)%7|8ufA6Zq}X{&W;tU4YjJ49UT+R)D@>1dkts?`Fc+SkOQ&z%}r~Nmx>u< zQ5WSQU5LOy)Vu*-}v_Xyv19SAdao! 
zdqI>s?qvtJzJB)VBX=i;8)mmD%{3%98vM#aMPD(lGirZ5s|6So8=pB@fvhg>aOS2|7Rq%$RMaK zFsdMq%N10wVBjkg1J*~I+w%i2b78?K-JWtF04s-lgeR`0pzJp^{Ei!I1+9v3JZg#-bDU6=#`|=I8q)zc z0pQm+V{#M>Ldu_`)X~bTv0B;l;~V<0rc+&ICX}BrCZ*gR4GL}(N~IGgkt`OoD70ww zT47Kv$)o-+91kj@Jq^5Bl=tq>rP}X_T8<+e3R!p4DA3#mDk_TD)MvJOBx%8eVs}um zf3|up8YZ0NE^+jX)XzjwT1q4)(hZm`S}Upiy_{LpRnR1<@!d^J&O3cMz@Y`PFscj_ z1Qz<}?hJ+rF1lxL2TahU37Y%)h7(PRAX|sBTPPP8&{4dJYsw&e2SM97Ryb>DKyV4< z#v(({$jbL#Q8H8`5YA5%y)*RDhXE74-{_+@aIO@6v?Ut<&I%0^b@5HySikZ>AQe26 zVPHOjX4Oznqd?;yO4MlCHhFD#@u0#@A{hPN(A%e0I7N^DOojq8W4Dxjv0XuArpe2i zRdkTr=gGik9NhX*iZgdT+Cs2AMftBoWT^f_IVk=PoVvfS0ROdamRbAHujc~Q<}j=; zCgb~!yC4z_MMVsH!aP7c@BK&#dajFZWSFp`Tmj%FgDAvq1(JnDeMAx6`Bmw&;Hx)E z@+45p59PgMu|RnbAowu=fN;DFEk|m^tfl1W4iqtIn6RSQACv-3Ttl;YDCHVBLXR=! zCOATiR?4Fi%QbY15t7}OXf_U6VzQQ^nL9LwuVK>r-6#|>;vQN^|5KHe*I~g0aRD%= z9@pbita>8Sf=9i^ARmd_um`#V9o+4X>rij9@Bb&BH*qEzl6?E*xbq#gh4ob3bow%M z`Q0B>kr|GFaX>lb{A@1-U{aH(^|(ZROBw_>ZrtFuo*9QZSAxwISj>?YgIP^+2bw+n z;O0i6pO4SvxG9XbW{5+A0*9^~OA~WpNEGV8%+QS;zKc@-9bw7(=pi?^9Nzr=Je$z| ze$Ne3Qi~A`v$y{<*ZY&cVAVHlH$Zu@v{LF$;4sR~vwQcj%eowBZm8qGhxIsfKjEfH zFa{-~_#;@X&k3XCkDUG^D~HYdCzeHigTGJSl&xa*7jwn6+l;>MWZwm^O zL7~bX{0i97B0V8uV5@6WNX2#AYqz)Y`qPFB4R5o2XBzO7USU;ie$W4=1B+s>AhUSr zkV>m#m1aB=r|TSV0YdRmqX2*+{yQj}>!a~E6j7y(`=BMx06@{Hb9uQ9T0q@XlC==R z-8WET`^T;Mn@ly*tgsvC9d+ME%io3{imbPbe)~Ka{fB5Y8YtETTor%j=U)Uly452+b1 zDY=}aYKTU(&{IPh85%QIew0Ft`I%-GX0{RHi%WRMxUfc;XbAQ&J$bvf!69;u|E-?A ze@VJr0>6dFWnC;ONEtd=&PsXt6EvHDa;JFK`TSQi^7{+D%0k?LnQ8_<+rAi%C&yX) z?8n18&ODVNw?~S9WXYl2B3CO0h5BHF&>)}5?w47d!;>8I zMf^=`Yu0#Nc$Uiev6m-^k&8RC-x%M!x-9cuk8wUK|FFoI-wH>w0;jpJlY;cBg zRUX@PZ>$lYol=+PE^H9XPUds|7Ix95O*vh(w&(k+{_zja=rYN!5>P3a%QZD~^@YCMq<70jD8&w zyT#KX(|*&7iWQZW=|ih{=`#T!3Ba49d$vw4yTqn;I;Xz?>V*I|$J%tx4dCnb)yw0B7Pd3bqVog}xT*|O-^szyzf?{V! 
z--#!eWzz62GB*dr8;?s6A_KfRFgjnS!aNb+%lU%3{yg!B^^_afjpk!lo_1hB!JiJL;)Vbteg0%o z3cOBFUT5xz25`0ap%ffaOrzDbGBy9RXof&BS6kF6J4?{wRTocaf>vKh#=zo_?m|lA z%74_g@mfXrAB}$$62@ZD>)4Jqnvddl;70kV7%qsJg_h>kp;F8H{ruQy9CLQ$fCH=R<;qdEIikdeA%wz2OsG399~{s6$=P$|06)e zLXXbhpb5x7$JhSNIr=}o1Zt3G>Kto5GiTy<9nHPQ?1n0)PjMYKIT}?-A6_u4S2r9m z15d%H*G^@vw*27Ah+xT-?WuU`e4^15~+ytj}ef$nwicP;EMpA}9O1eZB0iS`-|)4(@;g73@y(LAkD< zRyA^ed`}(5T0`bm={dPHhy*28sG_s~A!dxPf_g*z_k}N(7yNKsugf7&dI4%X$}Q(* zoOyVFiW~K-e6!59uE+V{F9Npy{>`}|6Nx0|G7B)-5{0F zzgDIl^f!2pHkeomSagk^WVJ2}9MP0{VSKKoCI7~#n+Y5zDGGzvpfC#Ts+8UN>=3rVILD>j+%du7Pb%WEAn zi-m)W>dJ4J%EsCE=)p1S`5IH(jbPgW>DA>4a|(V*;u8Gb{hr6pr&WUwp3rOpo5$6+ z6@K(9og_W5BnK2`*H+hz@Z(Ff3SR=C$iuL z2H5C9$+NXFTdN87s)0H78s&DxqlGRC=o4JhOYA+6mR*xZ7f!wTJlod&|a3-VpR-Rodt2X(w#DPdL` zUK?A>qNDr8upaZ#jB5F$DU^hOk5AAuFl>!GzuAjxrBq=9VaM~HYJy0%F1H4-hA zzLj|-pK`dux;F>ZzAM+<#cuhx%sN^A3k)#g&-ZkXsMfw`Uf5gp5>Sh&=mTh~$Gzrk zZ%=)2Q7JWQAhgVOC0W?L!M^~=mKQWiaxcq@g(968Ilqb-J@=}O%nYByc>@F-`L#v zMMl9i^JNaL|M=@ajqv3Xe))eEhV9Z^KISS&D(|m+WYL}2J;^NTbfRxp(lNL2 z@G`c}T|{udubxe}Y|qvi{I{4{Xogj8ZVZ(M4mzr9{v(Y0A6fY&h0?ED`wK!ub_1kD z9@F*-fl0{zaxQiUqMcr&_^)5T;t)|%pDe5bKF4Wn_o{{}_cRZ|@U2NQ-Fx<6UA@#T z#aHX6KM5%Bj;+;O1>1>1tplq@O^G$ad<%KAHguN329*C2MCW}83Iv*6tiMN8?b-G1 ze~b`EMA{D6_XH-{4zDNqYVBV`YH99?FN4yq1UdG6{J%@h&Wrxeg(WJ@*3#s3a@Y zs-OQ;Nf!3>%-zl8RZZ5u@ismwDe2ktHb?Ctb=TVJ3D^gk?0x#l4m*Rw7t1*`a7Ms8 ze{4sBJ-t}qhhzU7&{v?f{pD~rhhQ4C>!65=G5<@x>i?}h{eOVSI1cMVZ(8KTSsZUd zcZ7JEwL$bzN5)cX2jB6UFuXxFFj>;ebkS|_I|wQ^Y?0mMNJ}2{2NwD5pYS?Jsd=BD zjDlLa|6|^w4hw_~T%sU~0pq%4ybnY!6Ty?FqI_q@|^01W{h2okD%QC4-jyXm=5O zeFC6*_=7-6uyYW6W8$DI%o9}l^Dk2KIyn3o9tQ%2SHhFM;k37kgZw~knIYAX2`wdRYv?MF5`#Ve|GE&cak1cbu3Z+kd^4Hd$!_Rd>E59v4t zeo9*9kgL7Dtbh|N%jneWcgGRNE4jRF4hTwove(jCnj6(KU4fJ*CO%4ZCtC5g1T%eP z(_k_Ey>s>6WK>`B&RwR{uJU!*xnLcrrB4(ni~P(2KEgp_aJqrbJZhOBY_?yWD$stu zJ?)nbL)IH-p7Vp#EUyIhW=CL4au4btUinxf9GP0WLxcCn|~7*P2)khVA6IbzPc*aqy3G~?Xu zbgDH|K3RI}RuwG`*zhjDoSNo^^%7tM*xN_gP}S7DIYST_62hK+HT2N5C+6ERuEc>z 
z1^Cnu_+EwCK*2ir9Ex(ba)XARgz9L|X@EBD)@@3PBZ(tn@sA5eVE`Wz63%s+&UY~$ zNMGrg1U2I5}+S)x{Z6&0kUo>RMU%uARIRbXZgZ&|(fLh39VXVc>{zEWTr33QIcZr!%J{CsC zmw3fi7s;pcorQ*q`A~`%(0p61>%_GF{*n&s!@}MTb$xw(AYTSzMJ!%XNBB$5Z{PBH zua@jT;^1K9U5IPC2R>*3w)~t~B^OrYYsN7+7reH(Bu6FCq~~A%Qf(Q9bL}-?U8XtI z>`}0FPcF0JEO#P$9azu>i*r?WUie_+d=jX$@bGms z(U3|1Y?Z5|gG0|n(MPZ+sxPKg!7I_8!gskx_XZi6=SE8`AB=rN*>su6>Z#4FYzQggNrf}ei1q%<<@xznl`3%X=@ej12Kayr z+6%T!oDtJFt{+l^PnaMkpV-W1RFiyH3aGfi3CUBae+Bm24en}!4J5s*k%nyv{$P^} zK!REz*0cL7EHJrvdEo|Ijyi2) zQ*!T}etvK5oM(;Tqdw~Akr<6QsI2$uW=C;2z!##^A{_(8UsKr9&G%@YoMa_|+lLnar+zY)j+kxzlBx*C5rT~5!e6Y bOBb}w9_0f@gzulAUh`5y?pcAj-uwRr+*T%z diff --git a/qwen3_moe_mast_20steps_loss_curve.svg b/qwen3_moe_mast_20steps_loss_curve.svg deleted file mode 100644 index 7fc6c0ca..00000000 --- a/qwen3_moe_mast_20steps_loss_curve.svg +++ /dev/null @@ -1,68 +0,0 @@ - - - -Qwen3 MoE 30B-A3B MAST Training Loss - -9.5 - -10.0 - -10.5 - -11.0 - -11.5 - -12.0 - -12.5 - -1 - -2 - -4 - -6 - -8 - -10 - -12 - -14 - -16 - -18 - -20 - - -Training step -Loss - -step 1: 12.37845 -step 2: 12.36325 -step 3: 12.33137 -step 4: 12.28397 -step 5: 12.22048 -step 6: 12.14017 -step 7: 12.04897 -step 8: 11.94193 -step 9: 11.81908 -step 10: 11.68259 -step 11: 11.53297 -step 12: 11.37303 -step 13: 11.19815 -step 14: 11.02700 -step 15: 10.81583 -step 16: 10.61479 -step 17: 10.38304 -step 18: 10.15753 -step 19: 9.92291 -step 20: 9.66127 -12.37845 -9.66127 - \ No newline at end of file diff --git a/qwen3_moe_mast_20steps_losses.csv b/qwen3_moe_mast_20steps_losses.csv deleted file mode 100644 index cf58cdd8..00000000 --- a/qwen3_moe_mast_20steps_losses.csv +++ /dev/null @@ -1,21 +0,0 @@ -step,loss -1,12.37845 -2,12.36325 -3,12.33137 -4,12.28397 -5,12.22048 -6,12.14017 -7,12.04897 -8,11.94193 -9,11.81908 -10,11.68259 -11,11.53297 -12,11.37303 -13,11.19815 -14,11.02700 -15,10.81583 -16,10.61479 -17,10.38304 -18,10.15753 -19,9.92291 -20,9.66127 From a7ea958fab5b01772bbe6f098ce50ddf4dc856a2 Mon Sep 17 00:00:00 2001 From: 
Kaijian Wang Date: Fri, 5 Jun 2026 20:35:55 -0700 Subject: [PATCH 23/27] Add LP-relaxation solver, optimality-check gap logging, step-time profiling Adds solver="lp" (use the empirically-integral LP relaxation directly, skipping branch-and-bound) and an optimality_check option that solves the LP lower bound and logs the certified gap of the achieved objective. The sanity script now reports steady-state per-step latency. Tests cover the LP solver matching the ILP optimum and the gap logging. Authored with Claude. --- autoparallel/api.py | 119 ++++++++++++++++++++++----- autoparallel/approximate_sharding.py | 109 ++++++++++++++++-------- examples/_sanity_llama3.py | 20 +++-- tests/test_approximate_sharding.py | 39 +++++++++ 4 files changed, 225 insertions(+), 62 deletions(-) diff --git a/autoparallel/api.py b/autoparallel/api.py index f602a967..ef664dc4 100644 --- a/autoparallel/api.py +++ b/autoparallel/api.py @@ -194,6 +194,11 @@ class AutoParallel: The meta model is moved to a fake device based on mesh.device_type. """ + # Selectable solvers. "ilp": exact PuLP/CBC. "approx": heuristic TRW-S + # (light build, no PuLP). "lp": LP relaxation used directly as the solve + # (empirically integral for this problem, so much cheaper than CBC). + SOLVER_CHOICES = ("ilp", "approx", "lp") + def __init__( self, model, @@ -207,11 +212,16 @@ def __init__( solver: str = "ilp", ): self.stack = ExitStack() + # The solver chosen here decides how the optimizer is built: "ilp"/"lp" + # build the full PuLP problem (CBC exact solve / LP relaxation solve); # "approx" builds a lighter optimizer (no PuLP variables/constraints), - # which is much faster to construct; optimize_placement(solver="approx") - # then solves it heuristically. "ilp" builds the full PuLP problem. - if solver not in ("ilp", "approx"): - raise ValueError(f"Unknown solver={solver!r}; expected 'ilp' or 'approx'") + # much faster to construct, solved heuristically. optimize_placement( + # solver=...) 
may override the solve as long as it is compatible with + # this build. + if solver not in self.SOLVER_CHOICES: + raise ValueError( + f"Unknown solver={solver!r}; expected one of {self.SOLVER_CHOICES}" + ) self.solver = solver self.fake_mode = ( FakeTensorMode() @@ -289,7 +299,7 @@ def __enter__(self): self.mesh, force_grad_reduce_in_higher_precision, repeated_subgraphs=self.repeated_subgraphs, - build_pulp=self.solver != "approx", + build_pulp=self.solver in ("ilp", "lp"), ) self.sharding_optimizer = sharding_optimizer @@ -526,39 +536,73 @@ def propagate_annotations(self, verbose=True, aggressive=False, method="fix"): ) return self.propagation_result - def optimize_placement(self, verbose=True, solver=None, approximate_options=None): + def optimize_placement( + self, + verbose=True, + solver=None, + approximate_options=None, + optimality_check=False, + ): """Solve for the optimal placement. - solver="ilp" uses the exact PuLP/CBC solver. solver="approx" uses the - heuristic ApproximateShardingSolver, which trades a small objective gap - for a much faster solve. approximate_options is forwarded as kwargs to - the approximate solver (e.g. candidate_limit, max_sweeps). Defaults to the - solver chosen at AutoParallel construction; note an optimizer built with - solver="approx" has no PuLP problem and cannot run the ILP. + solver selects how the placement is solved (defaults to the solver chosen + at AutoParallel construction): + - "ilp": exact PuLP/CBC solve. + - "approx": heuristic TRW-S ApproximateShardingSolver — trades a small + objective gap for a much faster solve. + - "lp": solve the LP relaxation and use it directly. This problem is + empirically integral, so the relaxation optimum equals the ILP optimum + while skipping branch-and-bound; raises if it comes out fractional. + approximate_options is forwarded as kwargs to the approximate solver + (e.g. candidate_limit, max_sweeps). 
The requested solver must be + compatible with how the optimizer was built: "ilp"/"lp" need a PuLP + problem (build with solver="ilp" or "lp"). + + optimality_check: after solving, solve the LP relaxation as a lower bound + and log the certified gap of the achieved objective from the optimum. + Requires a PuLP problem (i.e. an "ilp"/"lp" build). """ self._assert_entered() if solver is None: solver = self.solver + opt = self.sharding_optimizer if solver in ("approx", "approximate"): from .approximate_sharding import ApproximateShardingSolver - approx = ApproximateShardingSolver( - self.sharding_optimizer, **(approximate_options or {}) - ) + approx = ApproximateShardingSolver(opt, **(approximate_options or {})) self.sharding_placement = approx.get_solution(verbose=verbose) elif solver == "ilp": - if self.sharding_optimizer.prob is None: + if opt.prob is None: raise RuntimeError( "solver='ilp' requires a PuLP problem, but this AutoParallel " - "was constructed with solver='approx' (no PuLP built). " + "was constructed without one (e.g. solver='approx'). " "Construct with solver='ilp' to use the exact solver." ) - self.sharding_placement = self.sharding_optimizer.get_solution( - verbose=False - ) + self.sharding_placement = opt.get_solution(verbose=False) + elif solver in ("lp", "lp_relax", "lp_relaxation"): + if opt.prob is None: + raise RuntimeError( + "solver='lp' requires a PuLP problem, but this AutoParallel " + "was constructed without one (e.g. solver='approx'). " + "Construct with solver='lp' or 'ilp' to use the LP solver." + ) + opt._set_objective() + res = opt.solve_lp_relaxation(verbose=verbose, extract=True) + if res["solution"] is None: + raise RuntimeError( + "solver='lp' requires an integral LP relaxation, but it came " + f"out fractional ({res['n_fractional']}/{res['n_vars']} " + "variables). Use solver='ilp' for an exact integral solve." 
+ ) + self.sharding_placement = res["solution"] else: - raise ValueError(f"Unknown solver={solver!r}; expected 'ilp' or 'approx'") + raise ValueError( + f"Unknown solver={solver!r}; expected one of {self.SOLVER_CHOICES}" + ) + + if optimality_check: + self._log_optimality_check(solver, verbose=verbose) if verbose: logger.info(self.sharding_optimizer.get_log(verbose=True)) @@ -589,6 +633,39 @@ def optimize_placement(self, verbose=True, solver=None, approximate_options=None return self.sharding_placement + def _log_optimality_check(self, solver, verbose=False): + """Solve the LP relaxation as a lower bound and log the certified gap of + the achieved objective from the optimum. Needs a PuLP problem.""" + import pulp + + opt = self.sharding_optimizer + if opt.prob is None: + logger.warning( + "optimality_check skipped: solver=%r build has no PuLP problem; " + "construct with solver='ilp' or 'lp' to enable it.", + self.solver, + ) + return + achieved = opt._safe_float(pulp.value(opt.prob.objective)) + lb_res = opt.get_lower_bound(verbose=verbose) + lb = lb_res.objective + if not lb or lb <= 0 or achieved is None: + logger.warning( + "optimality_check inconclusive: lower_bound=%s achieved=%s", + lb, + achieved, + ) + return + gap = (achieved - lb) / lb + logger.info( + "optimality check (solver=%s): objective=%.4f LP lower bound=%.4f " + "=> within %.2f%% of optimum (certified)", + solver, + achieved, + lb, + gap * 100, + ) + def _apply_placement_common(self, sharding_placement): t0 = time.perf_counter() self._assert_entered() diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py index 5361a8a0..ec44d7df 100644 --- a/autoparallel/approximate_sharding.py +++ b/autoparallel/approximate_sharding.py @@ -182,8 +182,11 @@ def _solve(self, verbose: bool = False): logger.info( "approx build: problem=%.2fs %s factors=%.2fs groups=%d " "cost_bearing=%d edges=%d max_domain=%d", - t_bp - t0, getattr(self, "_build_times", {}), t_bf - t_bp, - 
len(self.groups), len(self.cost_bearing), + t_bp - t0, + getattr(self, "_build_times", {}), + t_bf - t_bp, + len(self.groups), + len(self.cost_bearing), sum(len(v) for v in self.input_edges.values()), max((g.domain for g in self.groups), default=0), ) @@ -196,12 +199,13 @@ def _solve(self, verbose: bool = False): t_bp0 = time.perf_counter() self._belief_propagation(deadline) if verbose: - logger.info("approx phase: trws iter=%s delta=%.4g in %.2fs; " - "decode energy=%.1f", - getattr(self, "_bp_last_iter", None), - getattr(self, "_bp_last_delta", float("nan")), - time.perf_counter() - t_bp0, - self._fast_total_energy()) + logger.info( + "approx phase: trws iter=%s delta=%.4g in %.2fs; " "decode energy=%.1f", + getattr(self, "_bp_last_iter", None), + getattr(self, "_bp_last_delta", float("nan")), + time.perf_counter() - t_bp0, + self._fast_total_energy(), + ) self._memory_repair() self._coordinate_descent(deadline) if verbose: @@ -314,7 +318,9 @@ def _build_problem(self): for v in self.cost_bearing: node = opt.nodes[v] self.allowed_out[v] = [ - o for o in self.allowed_out[v] if not self._out_fully_forbidden(v, node, o) + o + for o in self.allowed_out[v] + if not self._out_fully_forbidden(v, node, o) ] t_forbid = time.perf_counter() @@ -375,9 +381,9 @@ def _parse_constraints(self): if pos_key is not None and neg_key is not None: break if pos_key is not None and neg_key is not None: - authoritative.setdefault( - (neg_key[0], neg_key[1]), set() - ).add(pos_key[0]) + authoritative.setdefault((neg_key[0], neg_key[1]), set()).add( + pos_key[0] + ) continue if name.startswith(self._SKIP_PREFIXES): continue @@ -410,8 +416,11 @@ def _parse_constraints(self): oa, ob = {k[2] for k in neg}, {k[2] for k in pos} if len(na) == 1 and len(nb) == 1 and len(oa) == 1 and len(ob) == 1: paired_edges.append( - (next(iter(na)), next(iter(nb)), - frozenset({(next(iter(oa)), next(iter(ob)))})) + ( + next(iter(na)), + next(iter(nb)), + frozenset({(next(iter(oa)), next(iter(ob)))}), + ) ) # 
method="fix" axis pins leave no PuLP row to parse above, so replay the # log to recover them (constraint-method pins are also picked up here, @@ -775,8 +784,11 @@ def candidates(m, assign): if nb in assign and nb in member_set: allowed = allow[(nb, m)].get(assign[nb], set()) cand = allowed if cand is None else (cand & allowed) - cand = set(self.allowed_out.get(m, [])) if cand is None else ( - cand & set(self.allowed_out.get(m, []))) + cand = ( + set(self.allowed_out.get(m, [])) + if cand is None + else (cand & set(self.allowed_out.get(m, []))) + ) return cand def dfs(i, assign): @@ -797,7 +809,8 @@ def dfs(i, assign): if len(results) >= limit: logger.warning( "Approximate solver: group of %d nodes hit group_domain_limit=%d.", - len(members), limit, + len(members), + limit, ) return results @@ -873,8 +886,9 @@ def _build_memory_info(self): for v in param_idxs: r = ratios[v] mn = min(r.values()) - self.allowed_out[v] = [o for o in self.allowed_out[v] - if r[o] <= mn + 1e-12] + self.allowed_out[v] = [ + o for o in self.allowed_out[v] if r[o] <= mn + 1e-12 + ] self._memory = { "param_idxs": param_idxs, "ratios": ratios, @@ -1140,8 +1154,11 @@ def _coordinate_descent(self, deadline): def _star_block_search(self, deadline): ranked = sorted( - ((len(self.nbrs[g]), g) for g in range(len(self.groups)) - if len(self.nbrs[g]) >= 2 and self.groups[g].domain > 1), + ( + (len(self.nbrs[g]), g) + for g in range(len(self.groups)) + if len(self.nbrs[g]) >= 2 and self.groups[g].domain > 1 + ), reverse=True, ) for _ in range(self.star_passes): @@ -1213,34 +1230,48 @@ def _block_energy(self, gids): def _current_memory(self): if self._memory is None: return 0.0 - return sum(self._memory["ratios"][v][self.cur_out[v]] - for v in self._memory["param_idxs"]) + return sum( + self._memory["ratios"][v][self.cur_out[v]] + for v in self._memory["param_idxs"] + ) def _memory_ok_after(self, gid, ci): if self._memory is None or self._memory.get("tight"): return True ratios = self._memory["ratios"] 
choice = self.groups[gid].choices[ci] - delta = sum(ratios[m][o] - ratios[m][self.cur_out[m]] - for m, o in choice.items() if m in ratios) + delta = sum( + ratios[m][o] - ratios[m][self.cur_out[m]] + for m, o in choice.items() + if m in ratios + ) mem = self._current_memory() + delta - return (self._memory["budget_low"] - 1e-6 <= mem - <= self._memory["budget_high"] + 1e-6) + return ( + self._memory["budget_low"] - 1e-6 + <= mem + <= self._memory["budget_high"] + 1e-6 + ) def _block_memory_ok(self): if self._memory is None or self._memory.get("tight"): return True mem = self._current_memory() - return (self._memory["budget_low"] - 1e-6 <= mem - <= self._memory["budget_high"] + 1e-6) + return ( + self._memory["budget_low"] - 1e-6 + <= mem + <= self._memory["budget_high"] + 1e-6 + ) def _memory_repair(self): if self._memory is None or self._memory.get("tight"): return low, high = self._memory["budget_low"], self._memory["budget_high"] ratios = self._memory["ratios"] - param_groups = {self.node_to_group[v] for v in self._memory["param_idxs"] - if v in self.node_to_group} + param_groups = { + self.node_to_group[v] + for v in self._memory["param_idxs"] + if v in self.node_to_group + } for _ in range(2 * max(1, len(param_groups))): mem = self._current_memory() if low - 1e-6 <= mem <= high + 1e-6: @@ -1254,8 +1285,11 @@ def _memory_repair(self): if ci == group.current: continue choice = group.choices[ci] - dmem = sum(ratios[m][choice[m]] - ratios[m][self.cur_out[m]] - for m in choice if m in ratios) + dmem = sum( + ratios[m][choice[m]] - ratios[m][self.cur_out[m]] + for m in choice + if m in ratios + ) if (dmem < -1e-9) != over and abs(dmem) > 1e-9: continue if abs(dmem) <= 1e-9: @@ -1264,8 +1298,13 @@ def _memory_repair(self): if best is None or score < best[0]: best = (score, gid, ci) if best is None: - logger.warning("Approximate solver: memory repair stuck at %.4f " - "(budget=[%.4f,%.4f]).", mem, low, high) + logger.warning( + "Approximate solver: memory repair stuck 
at %.4f " + "(budget=[%.4f,%.4f]).", + mem, + low, + high, + ) return self._set_group(best[1], best[2]) diff --git a/examples/_sanity_llama3.py b/examples/_sanity_llama3.py index 71fc1122..6a44b386 100644 --- a/examples/_sanity_llama3.py +++ b/examples/_sanity_llama3.py @@ -179,28 +179,36 @@ def input_fn(): try: losses = [] + step_times = [] for step in range(args.train_steps): + torch.cuda.synchronize(device) + t_step = time.perf_counter() optimizer.zero_grad(set_to_none=True) step_loss = torch.zeros((), device=device) for mi, ml in zip(input_mbs, label_mbs): logits = parallel_mod(mi) - if torch.any(torch.isnan(logits)): - raise RuntimeError("NaN in forward output") loss = vocab_parallel_cross_entropy( logits, ml, vocab_size=model_args.vocab_size, tp_group=tp_group, tp_rank=tp_rank, tp_degree=tp_degree, normalizer=normalizer) - if torch.any(torch.isnan(loss)): - raise RuntimeError("NaN in loss") loss.backward() step_loss = step_loss + loss.detach() torch.nn.utils.clip_grad_norm_(parallel_mod.parameters(), args.max_grad_norm) optimizer.step() + torch.cuda.synchronize(device) + step_times.append(time.perf_counter() - t_step) with torch.no_grad(): logged = step_loss.clone() dist.all_reduce(logged, op=dist.ReduceOp.SUM) losses.append(float(logged.item())) - print_rank0(f"step={step:03d} loss={losses[-1]:.6f}") - + print_rank0(f"step={step:03d} loss={losses[-1]:.6f} step_time={1000*step_times[-1]:.0f}ms") + + warmup = min(3, max(0, len(step_times) - 2)) + steady = sorted(step_times[warmup:]) + if steady: + mean_ms = 1000 * sum(steady) / len(steady) + print_rank0(f"[latency] solver={args.solver} per-step (excl {warmup} warmup, " + f"{len(steady)} steps): mean={mean_ms:.0f}ms " + f"median={1000*steady[len(steady)//2]:.0f}ms min={1000*steady[0]:.0f}ms") print_rank0(f"\nloss curve: {[round(x, 4) for x in losses]}") verdict = "PASS" if losses[-1] < losses[0] else "FAIL" print_rank0(f"SANITY {verdict}: loss {losses[0]:.4f} -> {losses[-1]:.4f}") diff --git 
a/tests/test_approximate_sharding.py b/tests/test_approximate_sharding.py index 0383fad8..05d05314 100644 --- a/tests/test_approximate_sharding.py +++ b/tests/test_approximate_sharding.py @@ -3,6 +3,7 @@ # This source code is licensed under the BSD license found in the # LICENSE file in the root directory of this source tree. +import logging import math import pulp @@ -89,6 +90,44 @@ def test_approx_objective_close_to_ilp(): ) +@apply_cuda_patches +@pytest.mark.filterwarnings("ignore:Constructing LpVariable") +@pytest.mark.filterwarnings("ignore:Overwriting previously set objective") +def test_lp_solver_matches_ilp(): + """The LP-relaxation solver returns an integral, ILP-feasible assignment whose + objective equals the exact ILP optimum (the relaxation is integral here).""" + mesh = _fake_2d_mesh() + with _tiny_llama3_autop(mesh) as autop: + _add_constraints(autop, mesh) + opt = autop.sharding_optimizer + + autop.optimize_placement(verbose=False, solver="lp") + lp_objective = pulp.value(opt.prob.objective) + violated = [n for n, c in opt.prob.constraints.items() if not c.valid()] + assert not violated, f"lp violated {len(violated)} constraints" + + autop.optimize_placement(verbose=False, solver="ilp") + ilp_objective = pulp.value(opt.prob.objective) + + assert math.isfinite(lp_objective) + assert lp_objective == pytest.approx(ilp_objective, rel=1e-6) + + +@apply_cuda_patches +@pytest.mark.filterwarnings("ignore:Constructing LpVariable") +@pytest.mark.filterwarnings("ignore:Overwriting previously set objective") +def test_optimality_check_logs_certified_gap(caplog): + """optimality_check=True solves the LP lower bound and logs the certified gap.""" + mesh = _fake_2d_mesh() + with _tiny_llama3_autop(mesh) as autop: + _add_constraints(autop, mesh) + with caplog.at_level(logging.INFO, logger="autoparallel.api"): + autop.optimize_placement( + verbose=False, solver="approx", optimality_check=True + ) + assert any("optimality check" in r.message for r in caplog.records) 
+ + @apply_cuda_patches @pytest.mark.filterwarnings("ignore:Constructing LpVariable") def test_approx_objective_is_faithful(): From 1435b7bcde1fc5fad3088871a63b0b796046ac02 Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sun, 7 Jun 2026 11:00:11 -0700 Subject: [PATCH 24/27] Approx solver: memory-constrained solve via Lagrangian relaxation The parameter-memory budget Sum_param ratio*x in [low,high] is a single node-separable linear coupling, so penalizing it by lambda folds lambda*ratio into the param unaries and leaves the pairwise MRF untouched. A scalar bisection on lambda drives the achieved memory into the budget, and a budget-constrained coordinate/star polish recovers integer solutions inside the (memory, cost) hull that no single lambda reaches. Routed in only for non-tight budgets; the tight default still uses build-time param pinning. Authored with Claude. --- autoparallel/approximate_sharding.py | 214 ++++++++++++++++++++++-- examples/_bench_mem_lagrangian.py | 237 +++++++++++++++++++++++++++ tests/test_approximate_sharding.py | 35 ++++ 3 files changed, 470 insertions(+), 16 deletions(-) create mode 100644 examples/_bench_mem_lagrangian.py diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py index ec44d7df..3e102da5 100644 --- a/autoparallel/approximate_sharding.py +++ b/autoparallel/approximate_sharding.py @@ -151,6 +151,11 @@ def __init__( self.consumers: dict[int, list[tuple[int, int]]] = defaultdict(list) self.cur_out: dict[int, int] = {} self._memory: Optional[dict[str, Any]] = None + # When False, the hard memory-budget checks in local search are skipped + # (used by the Lagrangian solve, which enforces the budget softly via a + # penalty folded into the unaries instead). + self._mem_enforce: bool = True + self._mem_unary: list[np.ndarray] = [] # Populated by _build_factors(). 
self.g_unary: list[np.ndarray] = [] @@ -197,23 +202,42 @@ def _solve(self, verbose: bool = False): # to be compared against is strictly dominated and has been dropped; the # polish remains for the memory budget and as a local-search safety net. t_bp0 = time.perf_counter() - self._belief_propagation(deadline) - if verbose: - logger.info( - "approx phase: trws iter=%s delta=%.4g in %.2fs; " "decode energy=%.1f", - getattr(self, "_bp_last_iter", None), - getattr(self, "_bp_last_delta", float("nan")), - time.perf_counter() - t_bp0, - self._fast_total_energy(), + mem = self._memory + if mem is not None and not mem.get("tight"): + # A non-tight budget can bind the runtime-optimal placement; solve it + # exactly via Lagrangian relaxation (folds λ·ratio into the unaries). + # A tight budget is already handled by build-time param pinning, and + # the no-memory case has nothing to relax, so both take the plain path. + res = self.solve_lagrangian( + mem["budget_low"], + mem["budget_high"], + deadline=deadline, + verbose=verbose, ) - self._memory_repair() - self._coordinate_descent(deadline) - if verbose: - logger.info("approx phase: trws+cd energy=%.1f", self._fast_total_energy()) - self._star_block_search(deadline) + if verbose: + logger.info( + "approx phase: lagrangian lam=%.4g memory=%.4f feasible=%s", + res["lam"], + res["memory"], + res["feasible"], + ) + else: + self._belief_propagation(deadline) + if verbose: + logger.info( + "approx phase: trws iter=%s delta=%.4g in %.2fs; " + "decode energy=%.1f", + getattr(self, "_bp_last_iter", None), + getattr(self, "_bp_last_delta", float("nan")), + time.perf_counter() - t_bp0, + self._fast_total_energy(), + ) + self._memory_repair() + self._coordinate_descent(deadline) + self._star_block_search(deadline) bp_energy = self._fast_total_energy() if verbose: - logger.info("approx phase: trws+cd+star energy=%.1f", bp_energy) + logger.info("approx phase: polished energy=%.1f", bp_energy) t_solve = time.perf_counter() - t0 - t_build 
objective = self._write_back() @@ -1236,7 +1260,7 @@ def _current_memory(self): ) def _memory_ok_after(self, gid, ci): - if self._memory is None or self._memory.get("tight"): + if self._memory is None or self._memory.get("tight") or not self._mem_enforce: return True ratios = self._memory["ratios"] choice = self.groups[gid].choices[ci] @@ -1253,7 +1277,7 @@ def _memory_ok_after(self, gid, ci): ) def _block_memory_ok(self): - if self._memory is None or self._memory.get("tight"): + if self._memory is None or self._memory.get("tight") or not self._mem_enforce: return True mem = self._current_memory() return ( @@ -1308,6 +1332,164 @@ def _memory_repair(self): return self._set_group(best[1], best[2]) + # ------------------------------------------------------------------ # + # Lagrangian memory-constrained solve + # ------------------------------------------------------------------ # + def _build_mem_unary(self): + """Per-group vector mem_unary[gid][ci] = Σ_{param member} ratio[member][ci], + i.e. the memory term as a node-separable unary so it folds into the + Lagrangian objective with no change to the pairwise structure.""" + self._mem_unary = [np.zeros(g.domain) for g in self.groups] + if self._memory is None: + return + ratios = self._memory["ratios"] + for v in self._memory["param_idxs"]: + gid = self.node_to_group.get(v) + if gid is None: + continue + r = ratios[v] + self._mem_unary[gid] += np.array( + [r[c[v]] for c in self.groups[gid].choices] + ) + + def _run_search(self, deadline): + self._belief_propagation(deadline) + self._coordinate_descent(deadline) + self._star_block_search(deadline) + + def solve_lagrangian( + self, + budget_low, + budget_high, + deadline=None, + max_iter=30, + lam_tol=1e-9, + verbose=False, + ): + """Memory-constrained solve via Lagrangian relaxation. + + The budget Σ_param ratio[v][x_v] ∈ [low, high] is a single linear, + node-separable coupling. 
Penalizing it by λ folds λ·ratio into each param + node's unary and leaves the pairwise MRF untouched, so TRW-S + polish + solves the penalized problem directly. a(λ) := Σ ratio at the optimum is + monotone non-increasing in λ (larger λ ⇒ more sharding ⇒ less memory), so + a scalar bisection on λ ≥ 0 drives a(λ) into the budget. The cheapest + feasible assignment seen is kept; the existing greedy repair only closes + any small residual from integrality. + + Leaves the solver at the chosen assignment (does not write back) and + returns a dict: objective (true), memory (achieved a), lam, feasible, + iters.""" + if not self._mem_unary: + self._build_mem_unary() + t_start = time.perf_counter() + if deadline is None: + deadline = t_start + self.max_time_s + # Reserve the tail of the budget for the constrained polish below. + bisect_deadline = t_start + 0.6 * (deadline - t_start) + base = [u.copy() for u in self.g_unary] + prev_enforce = self._mem_enforce + self._mem_enforce = False + eps = 1e-6 + + best = {"objective": INF, "snapshot": None, "memory": None, "lam": None} + # Closest over-budget assignment (smallest excess memory) — the seed the + # repair step nudges down into the budget to recover integer solutions + # that lie inside the (memory, cost) hull and so no lambda can reach. 
+ seed = {"memory": INF, "snapshot": None} + + def evaluate(lam): + for gid in range(len(self.groups)): + self.g_unary[gid] = base[gid] + lam * self._mem_unary[gid] + self._run_search(bisect_deadline) + a = self._current_memory() + obj = self.total_objective() + feasible = budget_low - eps <= a <= budget_high + eps + if feasible and obj < best["objective"]: + best.update( + objective=obj, + snapshot=[g.current for g in self.groups], + memory=a, + lam=lam, + ) + if budget_high + eps < a < seed["memory"]: + seed.update(memory=a, snapshot=[g.current for g in self.groups]) + if verbose: + logger.info( + "lagrangian: lam=%.6g memory=%.5f obj=%.2f feasible=%s", + lam, + a, + obj, + feasible, + ) + return a + + a0 = evaluate(0.0) + iters = 1 + if a0 <= budget_high + eps: + lam = 0.0 # unconstrained optimum already fits the budget + else: + lo_lam, hi_lam = 0.0, 1.0 + while evaluate(hi_lam) > budget_high + eps and iters < max_iter: + lo_lam, hi_lam = hi_lam, hi_lam * 2.0 + iters += 1 + while iters < max_iter and hi_lam - lo_lam > lam_tol: + mid = 0.5 * (lo_lam + hi_lam) + a = evaluate(mid) + iters += 1 + if a > budget_high + eps: + lo_lam = mid # still over budget, penalize harder + else: + hi_lam = mid # feasible, try to relax toward the cheaper side + lam = hi_lam + + for gid in range(len(self.groups)): + self.g_unary[gid] = base[gid] + self._mem_enforce = prev_enforce + + # Constrained polish (under the base unary, budget enforced). No single λ + # recovers integer solutions inside the (memory, cost) hull; coordinate + + # star search restricted to the budget can climb from an over-sharded + # point back up to a cheaper intermediate-memory one. We polish both the + # bisection's feasible point and the repaired closest-over-budget seed and + # keep the cheapest feasible result. 
+ def polish(snapshot): + for gid, ci in enumerate(snapshot): + self._set_group(gid, ci) + self._memory_repair() + self._coordinate_descent(deadline) + self._star_block_search(deadline) + a = self._current_memory() + if budget_low - eps <= a <= budget_high + eps: + obj = self.total_objective() + if obj < best["objective"]: + best.update( + objective=obj, + snapshot=[g.current for g in self.groups], + memory=a, + lam=lam, + ) + + for snap in (best["snapshot"], seed["snapshot"]): + if snap is not None: + polish(snap) + + if best["snapshot"] is not None: + for gid, ci in enumerate(best["snapshot"]): + self._set_group(gid, ci) + else: + # Nothing landed in [low, high]; repair the last assignment in place. + self._memory_repair() + + a = self._current_memory() + return { + "objective": self.total_objective(), + "memory": a, + "lam": lam, + "feasible": budget_low - eps <= a <= budget_high + eps, + "iters": iters, + } + # ------------------------------------------------------------------ # # Write-back # ------------------------------------------------------------------ # diff --git a/examples/_bench_mem_lagrangian.py b/examples/_bench_mem_lagrangian.py new file mode 100644 index 00000000..6166a552 --- /dev/null +++ b/examples/_bench_mem_lagrangian.py @@ -0,0 +1,237 @@ +"""Compare the Lagrangian memory-constrained approximate solve against the LP +(relaxation) optimum across a sweep of parameter-memory budgets. + +The optimizer (the expensive build) is constructed ONCE; each budget only +re-runs the cheap solves. For every budget factor `high` (with low=0): + - LP: set the memory constraint and solve the (integral) relaxation -> the + exact constrained optimum (gold standard). + - Lagrangian approx: fold lambda * ratio into the unaries and bisect lambda + until the achieved memory lands in the same [low, high] budget. +The two solvers are pinned to the SAME numeric budget (read back from the LP's +constraint rows) so the comparison is apples-to-apples. 
+ +Env: MODEL_TYPE (1b|8b), MESH ("8,8"), N_LAYERS (0=default), SEQLEN, +HIGH_FACTORS (comma list, default sweep), BP_ITERS. +""" +import logging +import os +import time +from unittest.mock import patch + +import pulp +import torch +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor.placement_types import Replicate, Shard +from torch.testing._internal.distributed.fake_pg import FakeStore + +from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs +from autoparallel.api import AutoParallel +from autoparallel.approximate_sharding import ApproximateShardingSolver +from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config +from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config + +logging.basicConfig(level=logging.ERROR) + + +def log(msg): + print(msg, flush=True) + + +_PATCHES = [ + patch("torch.cuda.device_count", lambda: 8), + patch("torch.cuda.get_device_name", lambda *a, **k: "H100"), + patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)), + patch( + "torch.cuda.get_device_properties", + lambda *a, **k: type( + "P", + (), + { + "major": 9, + "minor": 0, + "name": "H100", + "total_memory": 80 * 1024**3, + "multi_processor_count": 132, + }, + )(), + ), +] +for p in _PATCHES: + p.start() + +MODEL_TYPE = os.environ.get("MODEL_TYPE", "1b") +N_LAYERS = int(os.environ.get("N_LAYERS", "0")) +SEQLEN = int(os.environ.get("SEQLEN", str(2048 * 4))) +MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(",")) +BP_ITERS = int(os.environ.get("BP_ITERS", "120")) +HIGH_FACTORS = [ + float(x) + for x in os.environ.get( + "HIGH_FACTORS", "0.0156,0.03125,0.0625,0.125,0.25,0.5,1.0" + ).split(",") +] +# On budgets where the LP relaxation is fractional (its optimum is an +# unachievable lower bound) also solve the true ILP to report the achievable gap. 
+RUN_ILP = os.environ.get("RUN_ILP", "0") == "1" +ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "300")) + +world_size = 1 +for d in MESH_SHAPE: + world_size *= d + +_NAMES = {1: ("dp",), 2: ("dp", "tp"), 3: ("dp", "cp", "tp")} +mesh_names = _NAMES[len(MESH_SHAPE)] +fake_store = FakeStore() +torch.distributed.init_process_group( + "fake", store=fake_store, rank=0, world_size=world_size +) +mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", MESH_SHAPE, mesh_dim_names=mesh_names +) + +vocab_size = 128256 +batch_size = int(os.environ.get("BATCH", str(2 * mesh.shape[0]))) +seqlen = SEQLEN + + +def model_fn(): + args = TransformerModelArgs( + dim=2048, + n_layers=16, + n_heads=32, + n_kv_heads=8, + ffn_dim_multiplier=1.5, + multiple_of=256, + rope_theta=500000, + vocab_size=vocab_size, + max_seq_len=seqlen, + ) + if MODEL_TYPE == "8b": + args = TransformerModelArgs( + dim=4096, + n_layers=32, + n_heads=32, + n_kv_heads=8, + ffn_dim_multiplier=1.3, + multiple_of=1024, + rope_theta=500000, + vocab_size=vocab_size, + max_seq_len=seqlen, + ) + if N_LAYERS: + args.n_layers = N_LAYERS + with torch.device("meta"): + return Transformer(args) + + +def input_fn(): + return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda") + + +set_nccl_topo_config(detect_nccl_topo_config(mesh)) +mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) + +log( + f"model={MODEL_TYPE} layers={N_LAYERS or 'default'} mesh={MESH_SHAPE} " + f"world={world_size} seqlen={seqlen} bp_iters={BP_ITERS}" +) + +# ---- build once ---- +t = time.perf_counter() +autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True) +autop.__enter__() +ndim = mesh.ndim +x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1) +out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding +# Build with a LOOSE budget so the approx build does not pin params to the +# min-ratio (fully-sharded) choices; the per-budget sweep overrides the budget +# 
numerically afterward. (A tight default would prune param strategies at build +# time and freeze the achievable memory.) +autop.add_parameter_memory_constraint(low=0.0, high=1.0) +autop.add_input_constraints([x_sharding]) +autop.add_output_constraints([out_sharding]) +opt = autop.sharding_optimizer +log( + f"[build] optimizer ready in {time.perf_counter() - t:.2f}s " + f"vars={len(opt.pulp_variables)} nodes={len(opt.nodes)}" +) + +# build the approximate solver once (ratios / factor graph / mem unary cached) +t = time.perf_counter() +approx = ApproximateShardingSolver(opt, bp_iters=BP_ITERS) +approx._build_problem() +approx._build_factors() +approx._build_mem_unary() +log( + f"[build] approx solver ready in {time.perf_counter() - t:.2f}s " + f"groups={len(approx.groups)} " + f"params={len(approx._memory['param_idxs']) if approx._memory else 0}" +) +opt._set_objective() + + +def lp_budget(): + """Read back the exact [low, high] the LP applied, so approx uses the same.""" + ch = opt.prob.constraints["memory_constraint_high"] + cl = opt.prob.constraints["memory_constraint_low"] + return -cl.constant, -ch.constant + + +log("\n" + "=" * 110) +log( + f"{'high_f':>8} | {'budget':>16} | {'LP obj':>12} {'frac':>7} {'LP s':>6} | " + f"{'approx obj':>12} {'mem':>7} {'lam':>9} {'feas':>5} {'s':>5} | " + f"{'gap/LP':>7} {'ILP obj':>12} {'gap/ILP':>8}" +) +log("-" * 110) + +rows = [] +for hf in HIGH_FACTORS: + opt._memory_constraint = (0.0, hf) + t = time.perf_counter() + lp = opt.solve_lp_relaxation(verbose=False, extract=False) + lp_s = time.perf_counter() - t + lp_obj = lp["objective"] + frac = f"{lp['n_fractional']}/{lp['n_vars']}" + blow, bhigh = lp_budget() + + approx._memory["budget_low"] = blow + approx._memory["budget_high"] = bhigh + approx._memory["tight"] = abs(bhigh - blow) < 1e-9 + t = time.perf_counter() + res = approx.solve_lagrangian(blow, bhigh, max_iter=24) + ap_s = time.perf_counter() - t + ap_obj = res["objective"] + gap = (ap_obj - lp_obj) / lp_obj * 100 if 
lp_obj else float("nan") + + ilp_obj, gap_ilp = None, None + if RUN_ILP and lp["n_fractional"] > 0: + opt._set_objective() + opt._apply_memory_constraint() + opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, timeLimit=ILP_TIMEOUT)) + ilp_obj = pulp.value(opt.prob.objective) + gap_ilp = (ap_obj - ilp_obj) / ilp_obj * 100 if ilp_obj else float("nan") + + rows.append((hf, lp_obj, ap_obj, gap, res["feasible"], ilp_obj, gap_ilp)) + log( + f"{hf:>8.4g} | [{blow:>6.3f},{bhigh:>7.3f}] | {lp_obj:>12.1f} {frac:>7} " + f"{lp_s:>5.1f}s | {ap_obj:>12.1f} {res['memory']:>7.3f} {res['lam']:>9.4g} " + f"{str(res['feasible']):>5} {ap_s:>4.1f}s | {gap:>+6.2f}% " + f"{('%.1f' % ilp_obj) if ilp_obj else '-':>12} " + f"{('%+.2f%%' % gap_ilp) if gap_ilp is not None else '-':>8}" + ) + +log("=" * 110) +gaps = [r[3] for r in rows if r[1]] +feas = [r[4] for r in rows] +if gaps: + log( + f"gap vs LP: mean={sum(gaps)/len(gaps):+.2f}% max={max(gaps):+.2f}% " + f"min={min(gaps):+.2f}% feasible={sum(feas)}/{len(feas)}" + ) +gi = [r[6] for r in rows if r[6] is not None] +if gi: + log( + f"gap vs ILP (fractional-LP budgets): mean={sum(gi)/len(gi):+.2f}% " + f"max={max(gi):+.2f}%" + ) diff --git a/tests/test_approximate_sharding.py b/tests/test_approximate_sharding.py index 05d05314..b75ebe52 100644 --- a/tests/test_approximate_sharding.py +++ b/tests/test_approximate_sharding.py @@ -90,6 +90,41 @@ def test_approx_objective_close_to_ilp(): ) +@apply_cuda_patches +@pytest.mark.filterwarnings("ignore:Constructing LpVariable") +@pytest.mark.filterwarnings("ignore:Overwriting previously set objective") +def test_approx_memory_constrained_matches_ilp(): + """A non-tight parameter-memory budget routes the approx solver through the + Lagrangian relaxation. 
The result must respect the budget and stay within a + small objective gap of the budget-constrained ILP optimum.""" + mesh = _fake_2d_mesh() + with _tiny_llama3_autop(mesh) as autop: + # high=0.5 > 1/world_size, so the budget is non-tight (params are not + # pinned at build time) and can bind the runtime-optimal placement. + autop.add_parameter_memory_constraint(low=0.0, high=0.5) + autop.add_input_constraints([(Shard(0),) + (Replicate(),) * (mesh.ndim - 1)]) + autop.add_output_constraints([(Shard(0), Shard(2))]) + opt = autop.sharding_optimizer + + autop.optimize_placement(verbose=False, solver="approx") + approx_objective = pulp.value(opt.prob.objective) + # Materialize the memory rows and check the approx assignment against ALL + # constraints, including the budget it was solved under. + opt._apply_memory_constraint() + violated = [n for n, c in opt.prob.constraints.items() if not c.valid()] + assert not violated, f"approx violated {len(violated)} constraints" + + autop.optimize_placement(verbose=False, solver="ilp") + ilp_objective = pulp.value(opt.prob.objective) + + assert math.isfinite(approx_objective) + assert approx_objective >= ilp_objective - 1e-6 # ILP is optimal + assert approx_objective <= ilp_objective * 1.05 + 1e-6, ( + f"approx={approx_objective} ilp={ilp_objective} " + f"gap={(approx_objective / ilp_objective - 1) * 100:.2f}%" + ) + + @apply_cuda_patches @pytest.mark.filterwarnings("ignore:Constructing LpVariable") @pytest.mark.filterwarnings("ignore:Overwriting previously set objective") From 99339c3015f91bb545d6fa2089f474e2a319952c Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Sun, 7 Jun 2026 11:17:40 -0700 Subject: [PATCH 25/27] Mark annotation propagation and DP solver as experimental The Shardy-like annotation propagation (annotate_* / propagate_annotations) and the DP-based solver are opt-in and off by default: annotations do nothing unless explicitly propagated before optimize_placement(), and the DP solver is only reachable via the 
non-default solver_backend="dp" (not exposed through AutoParallel) and still raises NotImplementedError. Document them as experimental / unstable so the default solve path is unambiguous. Authored with Claude. --- autoparallel/api.py | 12 ++++++++++-- autoparallel/optimize_sharding.py | 21 +++++++++++++-------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/autoparallel/api.py b/autoparallel/api.py index ef664dc4..5e99d7d4 100644 --- a/autoparallel/api.py +++ b/autoparallel/api.py @@ -378,6 +378,11 @@ def add_output_constraints(self, constraints): self.output_constraints = constraints # ---- Sharding annotations (Shardy-like propagation) ---- + # EXPERIMENTAL: opt-in only. These have no effect unless you call an + # annotate_* method and then propagate_annotations() before + # optimize_placement(); the default solve path never invokes them. The + # propagation may shrink the search space in ways that move the objective off + # the full-ILP optimum, so treat results as unstable. def _normalize_placements(self, placements): """Pad/validate a placement tuple to mesh.ndim, leaving missing trailing @@ -499,12 +504,15 @@ def _mirror_annotations_to_backward(self): return mirrored def propagate_annotations(self, verbose=True, aggressive=False, method="fix"): - """Propagate the registered annotations Shardy-style and turn the + """EXPERIMENTAL (opt-in, off by default; may be unstable). + + Propagate the registered annotations Shardy-style and turn the unambiguously-determined nodes into ILP constraints, shrinking the search space. Returns a :class:`PropagationResult`. Call this after the ``annotate_*`` / ``add_*_constraint`` calls and - before :meth:`optimize_placement`. + before :meth:`optimize_placement`. The default solve path does not call + this; nothing happens unless you invoke it explicitly. With ``aggressive=False`` (the default) only genuine ``Shard`` axes are pinned, which keeps the full-ILP optimum reachable. 
``aggressive=True`` diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index 9e73889e..b14cb142 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -172,9 +172,7 @@ def _par_node_edge_costs(node_idx): arg_rows = [] for argi, redist_costs in enumerate(output_strategy.redistribute_cost): producer_strategy = ( - producer_strategies[argi] - if argi < len(producer_strategies) - else None + producer_strategies[argi] if argi < len(producer_strategies) else None ) arg_rows.append( [ @@ -193,7 +191,6 @@ def _par_node_edge_costs(node_idx): return node_idx, out_data - def concretize_symint(val): """Concretize a SymInt to a plain int, pass through other values. @@ -308,6 +305,14 @@ class DPTopology: class DPBasedShardingSolver: + """EXPERIMENTAL / incomplete — not part of the supported solver path. + + Only reachable when ``ShardingOptimizer`` is built with the non-default + ``solver_backend="dp"`` (not exposed through ``AutoParallel``), and today it + only builds a topological order: :meth:`get_solution` raises + ``NotImplementedError``. Kept for in-progress work; do not rely on it. + """ + def __init__(self, optimizer): self.optimizer = optimizer self.topology: Optional[DPTopology] = None @@ -1133,9 +1138,7 @@ def _compute_node_edge_costs(self, root_idxs): # order as the serial path. This keeps the PuLP objective's # lpSum term order identical too, so even the ILP path is # bit-for-bit unchanged (float addition is not associative). - return list( - pool.imap(_par_node_edge_costs, root_idxs, chunksize=4) - ) + return list(pool.imap(_par_node_edge_costs, root_idxs, chunksize=4)) finally: _FORK_OPT = None @@ -1167,7 +1170,9 @@ def _find_decision_var(self, node_idx, argi, out_idx): that only need per-strategy costs can use whichever edge survived. 
""" strategy = self.strats[self.nodes[node_idx]].strategies[out_idx] - n_inp = len(strategy.redistribute_cost[argi]) if strategy.redistribute_cost else 1 + n_inp = ( + len(strategy.redistribute_cost[argi]) if strategy.redistribute_cost else 1 + ) for inp_idx in range(n_inp): key = (node_idx, argi, out_idx, inp_idx) if key in self.decision_vars: From 65c95036f817ea172f4cb4c8004a45a3b7b6f8ca Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Mon, 8 Jun 2026 12:08:51 -0700 Subject: [PATCH 26/27] Prepare approx-solver PR: drop qwen3/scratch benches, fix test cache isolation Excludes the Qwen3 model + examples (shipped in a separate PR) and the scratch _bench_/_sanity scripts from this branch, and applies black/isort formatting. Adds an autouse conftest fixture that clears the process-global placement-options cache before each test, so an optimizer build never reuses stale strategies from a prior test's model (this otherwise made test_lp_relaxation fail when run after test_approximate_sharding). Authored with Claude. 
--- autoparallel/_testing/models/dsv3.py | 2 +- autoparallel/_testing/models/qwen3.py | 976 --------------------- autoparallel/serialization.py | 3 +- examples/_bench_3d_cert.py | 154 ---- examples/_bench_anno.py | 116 --- examples/_bench_approx.py | 166 ---- examples/_bench_approx_diag.py | 173 ---- examples/_bench_approx_ils.py | 136 --- examples/_bench_approx_sweep.py | 106 --- examples/_bench_build_profile.py | 93 -- examples/_bench_build_verify.py | 92 -- examples/_bench_dp_alone.py | 103 --- examples/_bench_lp_3d.py | 107 --- examples/_bench_lp_integrality.py | 118 --- examples/_bench_mem_lagrangian.py | 237 ----- examples/_bench_merge.py | 293 ------- examples/_bench_sizes.py | 166 ---- examples/_bench_trws.py | 173 ---- examples/_sanity_llama3.py | 223 ----- examples/example_llama3.py | 3 - examples/example_qwen3.py | 242 ----- examples/example_sanity_check_qwen3.py | 335 ------- examples/example_sanity_check_qwen3_moe.py | 466 ---------- examples/example_torchtitan_qwen3_dense.py | 370 -------- tests/conftest.py | 10 + tests/test_dsv3_torchtitan_config.py | 35 - tests/test_optimize_placement.py | 4 +- tests/test_qwen3.py | 323 ------- 28 files changed, 13 insertions(+), 5212 deletions(-) delete mode 100644 autoparallel/_testing/models/qwen3.py delete mode 100644 examples/_bench_3d_cert.py delete mode 100644 examples/_bench_anno.py delete mode 100644 examples/_bench_approx.py delete mode 100644 examples/_bench_approx_diag.py delete mode 100644 examples/_bench_approx_ils.py delete mode 100644 examples/_bench_approx_sweep.py delete mode 100644 examples/_bench_build_profile.py delete mode 100644 examples/_bench_build_verify.py delete mode 100644 examples/_bench_dp_alone.py delete mode 100644 examples/_bench_lp_3d.py delete mode 100644 examples/_bench_lp_integrality.py delete mode 100644 examples/_bench_mem_lagrangian.py delete mode 100644 examples/_bench_merge.py delete mode 100644 examples/_bench_sizes.py delete mode 100644 examples/_bench_trws.py delete mode 
100644 examples/_sanity_llama3.py delete mode 100644 examples/example_qwen3.py delete mode 100644 examples/example_sanity_check_qwen3.py delete mode 100644 examples/example_sanity_check_qwen3_moe.py delete mode 100644 examples/example_torchtitan_qwen3_dense.py delete mode 100644 tests/test_dsv3_torchtitan_config.py delete mode 100644 tests/test_qwen3.py diff --git a/autoparallel/_testing/models/dsv3.py b/autoparallel/_testing/models/dsv3.py index 05f78a92..5a897b71 100644 --- a/autoparallel/_testing/models/dsv3.py +++ b/autoparallel/_testing/models/dsv3.py @@ -1581,7 +1581,7 @@ def __init__( route_norm=moe_cfg.router.route_norm, route_scale=moe_cfg.router.route_scale, score_before_experts=moe_cfg.experts.token_dispatcher.score_before_experts, - use_grouped_mm=getattr(moe_cfg.experts, "use_grouped_mm", True), + use_grouped_mm=moe_cfg.experts.use_grouped_mm, load_balance_coeff=moe_cfg.load_balance_coeff, mesh=mesh, compute_dtype=compute_dtype, diff --git a/autoparallel/_testing/models/qwen3.py b/autoparallel/_testing/models/qwen3.py deleted file mode 100644 index 7bef8b17..00000000 --- a/autoparallel/_testing/models/qwen3.py +++ /dev/null @@ -1,976 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. 
- -import math -from dataclasses import dataclass -from typing import Callable, ClassVar, Optional - -import torch -import torch.nn.functional as F -from torch import nn -from torch.distributed.tensor import DeviceMesh -from torch.distributed.tensor.placement_types import Partial, Replicate, Shard -from torch.fx import traceback as fx_traceback -from torch.nn.attention import sdpa_kernel, SDPBackend - -from autoparallel._testing.models.dsv3 import ( - _permute, - _run_experts_for_loop, - _run_experts_grouped_mm, - _token_combine, -) -from autoparallel.collectives import all_to_all, axis_size, local_map - - -def has_cuda_capability(major: int, minor: int) -> bool: - return torch.cuda.is_available() and torch.cuda.get_device_capability() >= ( - major, - minor, - ) - - -class ScaledDotProductAttention(torch.nn.Module): - backends: ClassVar[list[SDPBackend]] = [] - - def __init__(self, attn_mask_type: str) -> None: - super().__init__() - if attn_mask_type != "causal": - raise ValueError("Qwen3 with SDPA currently only supports causal mask.") - - ScaledDotProductAttention._init_backend() - - @classmethod - def _init_backend(cls) -> None: - if cls.backends: - return - - cls.backends = [ - SDPBackend.FLASH_ATTENTION, - SDPBackend.EFFICIENT_ATTENTION, - SDPBackend.MATH, - ] - if has_cuda_capability(10, 0): - cls.backends.insert(0, SDPBackend.CUDNN_ATTENTION) - - def forward( - self, - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - scale: float | None = None, - ) -> torch.Tensor: - assert self.backends, "SDPA backends should not be empty." 
- with sdpa_kernel(self.backends, set_priority=True): - return F.scaled_dot_product_attention( - q, - k, - v, - is_causal=True, - scale=scale, - ) - - -def build_attention(attn_mask_type: str): - if attn_mask_type != "causal": - raise ValueError("Qwen3 with SDPA currently only supports causal mask.") - return ScaledDotProductAttention(attn_mask_type) - - -@dataclass -class Qwen3ModelArgs: - dim: int = 4096 - n_layers: int = 36 - n_heads: int = 32 - n_kv_heads: Optional[int] = 8 - head_dim: int = 128 - hidden_dim: int = 12288 - vocab_size: int = 151936 - norm_eps: float = 1e-6 - rope_theta: float = 1000000.0 - max_seq_len: int = 4096 - depth_init: bool = True - attn_mask_type: str = "causal" - eos_id: int = 0 - enable_weight_tying: bool = False - moe_enabled: bool = False - moe_hidden_dim: int = 768 - num_experts: int = 64 - top_k: int = 8 - route_norm: bool = True - route_scale: float = 1.0 - score_before_experts: bool = False - use_grouped_mm: bool = True - load_balance_coeff: Optional[float] = 1e-3 - moe_axis_name: str = "ep" - - def __post_init__(self) -> None: - n_kv_heads = self.n_heads if self.n_kv_heads is None else self.n_kv_heads - if self.n_heads % n_kv_heads != 0: - raise ValueError( - f"n_heads ({self.n_heads}) must be divisible by " - f"n_kv_heads ({n_kv_heads})." - ) - if self.moe_enabled and self.top_k > self.num_experts: - raise ValueError( - f"top_k ({self.top_k}) must be <= num_experts ({self.num_experts})." 
- ) - - def update_from_config(self, job_config, tokenizer) -> None: - self.vocab_size = tokenizer.n_words - self.max_seq_len = job_config.training.seq_len - self.eos_id = tokenizer.eos_id - - def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - nparams = sum(p.numel() for p in model.parameters()) - nparams_embedding = sum( - sum(p.numel() for p in m.parameters()) - for m in model.children() - if isinstance(m, nn.Embedding) - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.head_dim, - seq_len, - ) - num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t - return nparams, num_flops_per_token - - -def qwen3_args_from_torchtitan_config(config) -> Qwen3ModelArgs: - """Build AutoParallel Qwen3 args from TorchTitan's Qwen3Model.Config.""" - if not config.layers: - raise ValueError("Qwen3 config must contain at least one layer.") - - first_layer = config.layers[0] - attention = first_layer.attention - moe = first_layer.moe - - if getattr(attention, "fuse_qkv", False): - raise ValueError("AutoParallel Qwen3 does not support fused QKV yet.") - - moe_enabled = moe is not None - if moe_enabled: - hidden_dim = 0 - moe_hidden_dim = moe.experts.hidden_dim - num_experts = moe.num_experts - top_k = moe.router.top_k - route_norm = moe.router.route_norm - route_scale = moe.router.route_scale - score_before_experts = moe.experts.token_dispatcher.score_before_experts - load_balance_coeff = moe.load_balance_coeff - else: - hidden_dim = first_layer.feed_forward.w1.out_features - moe_hidden_dim = 0 - num_experts = 0 - top_k = 1 - route_norm = True - route_scale = 1.0 - score_before_experts = False - load_balance_coeff = None - - return Qwen3ModelArgs( - dim=config.dim, - n_layers=len(config.layers), - n_heads=attention.n_heads, - n_kv_heads=attention.n_kv_heads, - head_dim=attention.head_dim, - hidden_dim=hidden_dim, - vocab_size=config.vocab_size, - norm_eps=config.norm.eps, - rope_theta=config.rope.theta, - 
max_seq_len=config.rope.max_seq_len, - attn_mask_type=attention.mask_type, - enable_weight_tying=config.enable_weight_tying, - moe_enabled=moe_enabled, - moe_hidden_dim=moe_hidden_dim, - num_experts=num_experts, - top_k=top_k, - route_norm=route_norm, - route_scale=route_scale, - score_before_experts=score_before_experts, - load_balance_coeff=load_balance_coeff, - ) - - -def qwen3_debug_args(**overrides) -> Qwen3ModelArgs: - args = Qwen3ModelArgs( - dim=256, - n_layers=8, - n_heads=16, - n_kv_heads=8, - head_dim=128, - hidden_dim=3072, - vocab_size=2048, - max_seq_len=4096, - enable_weight_tying=True, - ) - for key, value in overrides.items(): - setattr(args, key, value) - args.__post_init__() - return args - - -def qwen3_0_6b_args(**overrides) -> Qwen3ModelArgs: - args = Qwen3ModelArgs( - dim=1024, - n_layers=28, - n_heads=16, - n_kv_heads=8, - head_dim=128, - hidden_dim=3072, - vocab_size=151936, - enable_weight_tying=True, - ) - for key, value in overrides.items(): - setattr(args, key, value) - args.__post_init__() - return args - - -def qwen3_1_7b_args(**overrides) -> Qwen3ModelArgs: - args = Qwen3ModelArgs( - dim=2048, - n_layers=28, - n_heads=16, - n_kv_heads=8, - head_dim=128, - hidden_dim=6144, - vocab_size=151936, - enable_weight_tying=True, - ) - for key, value in overrides.items(): - setattr(args, key, value) - args.__post_init__() - return args - - -def qwen3_4b_args(**overrides) -> Qwen3ModelArgs: - args = Qwen3ModelArgs( - dim=2560, - n_layers=36, - n_heads=32, - n_kv_heads=8, - head_dim=128, - hidden_dim=9728, - vocab_size=151936, - enable_weight_tying=True, - ) - for key, value in overrides.items(): - setattr(args, key, value) - args.__post_init__() - return args - - -def qwen3_8b_args(**overrides) -> Qwen3ModelArgs: - args = Qwen3ModelArgs() - for key, value in overrides.items(): - setattr(args, key, value) - args.__post_init__() - return args - - -def qwen3_moe_debug_args(**overrides) -> Qwen3ModelArgs: - args = Qwen3ModelArgs( - dim=256, - 
n_layers=8, - n_heads=16, - n_kv_heads=8, - head_dim=128, - hidden_dim=3072, - vocab_size=2048, - max_seq_len=4096, - moe_enabled=True, - moe_hidden_dim=768, - num_experts=64, - top_k=8, - route_norm=True, - score_before_experts=False, - ) - for key, value in overrides.items(): - setattr(args, key, value) - args.__post_init__() - return args - - -def qwen3_30b_a3b_args(**overrides) -> Qwen3ModelArgs: - args = Qwen3ModelArgs( - dim=2048, - n_layers=48, - n_heads=32, - n_kv_heads=4, - head_dim=128, - hidden_dim=6144, - vocab_size=151936, - max_seq_len=262144, - moe_enabled=True, - moe_hidden_dim=768, - num_experts=128, - top_k=8, - route_norm=True, - score_before_experts=False, - ) - for key, value in overrides.items(): - setattr(args, key, value) - args.__post_init__() - return args - - -def qwen3_235b_a22b_args(**overrides) -> Qwen3ModelArgs: - args = Qwen3ModelArgs( - dim=4096, - n_layers=94, - n_heads=64, - n_kv_heads=4, - head_dim=128, - hidden_dim=12288, - vocab_size=151936, - max_seq_len=4096, - rope_theta=5000000.0, - moe_enabled=True, - moe_hidden_dim=1536, - num_experts=128, - top_k=8, - route_norm=True, - score_before_experts=False, - ) - for key, value in overrides.items(): - setattr(args, key, value) - args.__post_init__() - return args - - -def precompute_freqs_cos_sin( - dim: int, - max_seq_len: int, - theta: float = 1000000.0, -) -> torch.Tensor: - freq = theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim) - inv_freq = 1.0 / freq - t = torch.arange(max_seq_len, dtype=inv_freq.dtype, device=inv_freq.device) - freqs = torch.outer(t, inv_freq).float() - freqs = torch.cat([freqs, freqs], dim=-1) - cos = freqs.cos() - sin = freqs.sin() - return torch.cat([cos, sin], dim=-1) - - -def reshape_for_broadcast_cos_sin( - rope_cache: torch.Tensor, - x: torch.Tensor, -) -> torch.Tensor: - bsz, seqlen, _, head_dim = x.shape - rope_cache = rope_cache[0:seqlen] - assert rope_cache.shape == (seqlen, head_dim * 2) - return rope_cache.view(1, seqlen, 1, 
head_dim * 2).expand(bsz, -1, -1, -1) - - -def _rotate_half(x: torch.Tensor) -> torch.Tensor: - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_emb_cos_sin( - xq: torch.Tensor, - xk: torch.Tensor, - rope_cache: torch.Tensor, -) -> tuple[torch.Tensor, torch.Tensor]: - head_dim = xq.shape[-1] - rope_cache = reshape_for_broadcast_cos_sin(rope_cache, xq) - cos = rope_cache[..., :head_dim].to(device=xq.device) - sin = rope_cache[..., head_dim:].to(device=xq.device) - xq_f = xq.float() - xk_f = xk.float() - xq_out = (xq_f * cos) + (_rotate_half(xq_f) * sin) - xk_out = (xk_f * cos) + (_rotate_half(xk_f) * sin) - return xq_out.type_as(xq), xk_out.type_as(xk) - - -def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: - bs, slen, n_kv_heads, head_dim = x.shape - if n_rep == 1: - return x - return ( - torch.unsqueeze(x, dim=3) - .expand(bs, slen, n_kv_heads, n_rep, head_dim) - .reshape(bs, slen, n_kv_heads * n_rep, head_dim) - ) - - -def _to_activation_device(tensor: torch.Tensor, activation: torch.Tensor) -> torch.Tensor: - if tensor.device != activation.device and tensor.device.type == "meta": - return tensor.to(activation.device) - return tensor - - -def _rms_norm(x: torch.Tensor, norm: nn.RMSNorm) -> torch.Tensor: - weight = ( - _to_activation_device(norm.weight, x) - if norm.weight is not None - else None - ) - if weight is not None and weight.dtype != x.dtype: - weight = weight.to(dtype=x.dtype) - return F.rms_norm(x, norm.normalized_shape, weight, norm.eps).to(dtype=x.dtype) - - -def _linear(x: torch.Tensor, linear: nn.Linear) -> torch.Tensor: - weight = _to_activation_device(linear.weight, x) - bias = ( - _to_activation_device(linear.bias, x) - if linear.bias is not None - else None - ) - if weight.dtype != x.dtype: - weight = weight.to(dtype=x.dtype) - if bias is not None and bias.dtype != x.dtype: - bias = bias.to(dtype=x.dtype) - return F.linear(x, weight, bias) - - -class 
Attention(nn.Module): - def __init__(self, model_args: Qwen3ModelArgs): - super().__init__() - self.n_heads = model_args.n_heads - self.n_kv_heads = ( - model_args.n_heads - if model_args.n_kv_heads is None - else model_args.n_kv_heads - ) - self.n_rep = self.n_heads // self.n_kv_heads - self.head_dim = model_args.head_dim - self.scale = self.head_dim**-0.5 - - self.wq = nn.Linear( - model_args.dim, model_args.n_heads * self.head_dim, bias=False - ) - self.wk = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) - self.wv = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) - self.wo = nn.Linear( - model_args.n_heads * self.head_dim, model_args.dim, bias=False - ) - self.q_norm = nn.RMSNorm(self.head_dim, eps=model_args.norm_eps) - self.k_norm = nn.RMSNorm(self.head_dim, eps=model_args.norm_eps) - self.sdpa = build_attention(model_args.attn_mask_type) - - def init_weights(self, init_std: float): - for linear in (self.wq, self.wk, self.wv): - nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std) - self.q_norm.reset_parameters() - self.k_norm.reset_parameters() - - def forward( - self, - x: torch.Tensor, - freqs_cos_sin: torch.Tensor, - ): - bs, seqlen, _ = x.shape - xq, xk, xv = _linear(x, self.wq), _linear(x, self.wk), _linear(x, self.wv) - - xq = xq.view(bs, seqlen, -1, self.head_dim) - xk = xk.view(bs, seqlen, -1, self.head_dim) - xv = xv.view(bs, seqlen, -1, self.head_dim) - - xq = _rms_norm(xq, self.q_norm) - xk = _rms_norm(xk, self.k_norm) - freqs_cos_sin = _to_activation_device(freqs_cos_sin, xq) - xq, xk = apply_rotary_emb_cos_sin(xq, xk, freqs_cos_sin) - - keys = repeat_kv(xk, self.n_rep) - values = repeat_kv(xv, self.n_rep) - - xq = xq.transpose(1, 2) - xk = keys.transpose(1, 2) - xv = values.transpose(1, 2) - - output = self.sdpa(xq, xk, xv, scale=self.scale) - - output = output.transpose(1, 2).contiguous() - output = output.view(bs, seqlen, -1) - 
return _linear(output, self.wo) - - -class FeedForward(nn.Module): - def __init__(self, dim: int, hidden_dim: int): - super().__init__() - self.w1 = nn.Linear(dim, hidden_dim, bias=False) - self.w2 = nn.Linear(hidden_dim, dim, bias=False) - self.w3 = nn.Linear(dim, hidden_dim, bias=False) - - def forward(self, x): - return _linear(F.silu(_linear(x, self.w1)) * _linear(x, self.w3), self.w2) - - def init_weights(self, init_std: float): - nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02) - for linear in (self.w2, self.w3): - nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std) - - -class GroupedExperts(nn.Module): - def __init__( - self, - dim: int, - hidden_dim: int, - num_experts: int, - use_grouped_mm: bool, - ): - super().__init__() - self.num_experts = num_experts - self.w1 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim)) - self.w2 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim)) - self.w3 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim)) - self.use_grouped_mm = use_grouped_mm - - def forward( - self, - x: torch.Tensor, - num_tokens_per_expert: torch.Tensor, - ) -> torch.Tensor: - if self.use_grouped_mm: - return _run_experts_grouped_mm( - self.w1, self.w2, self.w3, x, num_tokens_per_expert - ) - return _run_experts_for_loop( - self.w1, self.w2, self.w3, x, num_tokens_per_expert - ) - - def init_weights(self, init_std: float): - nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02) - nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std) - nn.init.trunc_normal_(self.w3, mean=0.0, std=init_std) - - -def _qwen3_token_dispatch(routed_input, num_tokens_per_expert, axis_name): - ep_size = axis_size(axis_name) - num_tokens_per_expert_group = all_to_all( - num_tokens_per_expert, - None, - None, - axis_name, - ) - - with torch.no_grad(): - input_splits = ( - num_tokens_per_expert.view(ep_size, -1) - .sum(dim=1) - .to(torch.device("cpu"), non_blocking=True) - ) - output_splits = ( - num_tokens_per_expert_group.view(ep_size, -1) - 
.sum(dim=1) - .to(torch.device("cpu"), non_blocking=False) - ) - input_splits = input_splits.tolist() - output_splits = output_splits.tolist() - - with fx_traceback.annotate({"comm_region": "token_dispatch"}): - routed_input = all_to_all( - routed_input, - output_splits, - input_splits, - axis_name, - ) - - num_local_experts = num_tokens_per_expert_group.shape[0] // ep_size - return ( - *_permute( - routed_input, - num_tokens_per_expert_group, - ep_size, - num_local_experts, - ), - input_splits, - output_splits, - ) - - -def qwen3_moe_local_mapped_region( - x: torch.Tensor, - selected_experts_indices: torch.Tensor, - top_scores: torch.Tensor, - experts_w1: torch.Tensor, - experts_w3: torch.Tensor, - experts_w2: torch.Tensor, - out: torch.Tensor, - top_k: int, - num_experts: int, - score_before_experts: bool, - axis_name: str, -) -> tuple[torch.Tensor, torch.Tensor]: - dim = x.shape[-1] - ep_size = axis_size(axis_name) - if num_experts % ep_size != 0: - raise ValueError( - f"num_experts ({num_experts}) must be divisible by " - f"axis_size({axis_name!r}) ({ep_size})." 
- ) - - num_tokens_per_expert = torch.histc( - selected_experts_indices.flatten(), - bins=num_experts, - min=0, - max=num_experts, - ).view(-1) - - token_indices_experts_sorted = torch.argsort( - selected_experts_indices.view(-1), stable=True - ) - top_scores_experts_sorted = top_scores.view(-1)[token_indices_experts_sorted] - token_indices_experts_sorted = token_indices_experts_sorted // top_k - - routed_input = x[token_indices_experts_sorted] - if score_before_experts: - routed_input = ( - routed_input.to(torch.float32) * top_scores_experts_sorted.reshape(-1, 1) - ).to(x.dtype) - - shape = routed_input.shape - ( - input_shape, - routed_input, - permuted_indices, - num_tokens_per_expert_group, - input_splits, - output_splits, - ) = _qwen3_token_dispatch(routed_input, num_tokens_per_expert, axis_name) - - routed_output = _run_experts_grouped_mm( - experts_w1, - experts_w2, - experts_w3, - routed_input, - num_tokens_per_expert_group, - ) - routed_output = _token_combine( - routed_output, - input_shape, - permuted_indices, - input_splits, - output_splits, - axis_name, - ) - - torch._check(routed_output.shape[0] == shape[0]) - if not score_before_experts: - routed_output = ( - routed_output.to(torch.float32) * top_scores_experts_sorted.reshape(-1, 1) - ).to(routed_output.dtype) - - out = out.scatter_add( - dim=0, - index=token_indices_experts_sorted.reshape(-1, 1).expand(-1, dim), - src=routed_output, - ) - return out, num_tokens_per_expert - -class MoE(nn.Module): - def __init__( - self, - model_args: Qwen3ModelArgs, - mesh: DeviceMesh | None = None, - axis_name: str | None = None, - ): - super().__init__() - self.mesh = mesh - self.axis_name = axis_name or model_args.moe_axis_name - self.num_experts = model_args.num_experts - self.top_k = model_args.top_k - self.route_norm = model_args.route_norm - self.route_scale = model_args.route_scale - self.score_before_experts = model_args.score_before_experts - self.load_balance_coeff = model_args.load_balance_coeff - - 
self.router = nn.Linear(model_args.dim, model_args.num_experts, bias=False) - self.experts = GroupedExperts( - dim=model_args.dim, - hidden_dim=model_args.moe_hidden_dim, - num_experts=model_args.num_experts, - use_grouped_mm=model_args.use_grouped_mm, - ) - self.register_buffer( - "expert_bias", - torch.zeros(model_args.num_experts, dtype=torch.float32), - persistent=self.load_balance_coeff is not None, - ) - self.register_buffer( - "tokens_per_expert", - torch.zeros(model_args.num_experts, dtype=torch.float32), - persistent=False, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - bs, slen, dim = x.shape - x = x.view(-1, dim) - experts_w1, experts_w2, experts_w3 = self.experts.parameters() - experts_w1 = _to_activation_device(experts_w1, x) - experts_w2 = _to_activation_device(experts_w2, x) - experts_w3 = _to_activation_device(experts_w3, x) - - scores = F.linear( - x.to(torch.float32), - _to_activation_device(self.router.weight, x).to(torch.float32), - None, - ) - scores = F.softmax(scores, dim=-1) - expert_bias = _to_activation_device(self.expert_bias, scores) - scores_for_choice = ( - scores + expert_bias - if self.load_balance_coeff is not None - else scores - ) - _, selected_experts_indices = torch.topk( - scores_for_choice, - k=self.top_k, - dim=-1, - sorted=False, - ) - - top_scores = scores.gather(dim=-1, index=selected_experts_indices) - if self.route_norm: - denominator = top_scores.sum(dim=-1, keepdim=True) + 1e-20 - top_scores = top_scores / denominator - top_scores = top_scores * self.route_scale - - # Qwen3 MoE has no shared expert path, but keeping the initial output - # differentiably tied to x matches the DSv3 local_map autograd shape. 
- out = x * 0 - out, num_tokens_per_expert = local_map( - qwen3_moe_local_mapped_region, - out_placements=( - (Shard(0), Shard(0)), - (Partial(reduce_op="sum"), Partial(reduce_op="sum")), - ), - in_placements=( - (Shard(0), Shard(0)), - (Shard(0), Shard(0)), - (Shard(0), Shard(0)), - (Replicate(), Shard(0)), - (Replicate(), Shard(0)), - (Replicate(), Shard(0)), - (Shard(0), Shard(0)), - None, - None, - None, - None, - ), - redistribute_inputs=True, - in_grad_placements=None, - device_mesh=self.mesh, - )( - x, - selected_experts_indices, - top_scores, - experts_w1, - experts_w3, - experts_w2, - out, - self.top_k, - self.num_experts, - self.score_before_experts, - self.axis_name, - ) - # This counter is only used for runtime load-balance diagnostics. During - # AutoParallel graph capture the module buffers are fake/meta tensors - # while the traced local_map output can be CUDA-fake, and recording this - # mutation is not needed for the solved training graph. - if not torch.compiler.is_compiling(): - with torch.no_grad(): - self.tokens_per_expert.add_(num_tokens_per_expert) # type: ignore[operator] - return out.reshape(bs, slen, dim) - - def init_weights( - self, - init_std: float, - buffer_device: torch.device, - ): - nn.init.trunc_normal_(self.router.weight, mean=0.0, std=init_std) - self.experts.init_weights(init_std) - with torch.device(buffer_device): - self.tokens_per_expert.zero_() # type: ignore[operator] - self.expert_bias.zero_() # type: ignore[operator] - - -class TransformerBlock(nn.Module): - def __init__( - self, - layer_id: int, - model_args: Qwen3ModelArgs, - mesh: DeviceMesh | None = None, - moe_axis_name: str | None = None, - ): - super().__init__() - self.attention = Attention(model_args) - self.moe_enabled = model_args.moe_enabled - if self.moe_enabled: - self.moe = MoE(model_args, mesh=mesh, axis_name=moe_axis_name) - else: - self.feed_forward = FeedForward( - dim=model_args.dim, - hidden_dim=model_args.hidden_dim, - ) - self.attention_norm = 
nn.RMSNorm(model_args.dim, eps=model_args.norm_eps) - self.ffn_norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps) - - if model_args.depth_init: - self.weight_init_std = 0.02 / math.sqrt(2 * (layer_id + 1)) - else: - self.weight_init_std = 0.02 / math.sqrt(2 * model_args.n_layers) - - def forward( - self, - x: torch.Tensor, - freqs_cos_sin: torch.Tensor, - ): - h = x + self.attention(_rms_norm(x, self.attention_norm), freqs_cos_sin) - if self.moe_enabled: - out = h + self.moe(_rms_norm(h, self.ffn_norm)) - else: - out = h + self.feed_forward(_rms_norm(h, self.ffn_norm)) - return out - - def init_weights(self, buffer_device: torch.device): - for norm in (self.attention_norm, self.ffn_norm): - norm.reset_parameters() - self.attention.init_weights(self.weight_init_std) - if self.moe_enabled: - self.moe.init_weights(self.weight_init_std, buffer_device) - else: - self.feed_forward.init_weights(self.weight_init_std) - - -class Transformer(nn.Module): - def __init__( - self, - model_args: Qwen3ModelArgs, - mesh: DeviceMesh | None = None, - moe_axis_name: str | None = None, - ): - super().__init__() - self.model_args = model_args - self.vocab_size = model_args.vocab_size - self.n_layers = model_args.n_layers - self.eos_id = model_args.eos_id - self.enable_weight_tying = model_args.enable_weight_tying - self.mesh = mesh - self.moe_axis_name = moe_axis_name or model_args.moe_axis_name - - self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim) - self.register_buffer( - "freqs_cos_sin", - self._precompute_freqs_cos_sin(), - persistent=True, - ) - - self.layers = torch.nn.ModuleDict() - for layer_id in range(model_args.n_layers): - self.layers[str(layer_id)] = TransformerBlock( - layer_id, - model_args, - mesh=mesh, - moe_axis_name=self.moe_axis_name, - ) - self.norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps) - self.lm_head = nn.Linear(model_args.dim, model_args.vocab_size, bias=False) - - if self.enable_weight_tying: - 
self.tok_embeddings.weight = self.lm_head.weight - - def init_weights( - self, - buffer_device: Optional[torch.device] = None, - seed: int | None = None, - ): - if seed is not None: - torch.manual_seed(seed) - - if self.enable_weight_tying: - self.tok_embeddings.weight = self.lm_head.weight - - buffer_device = buffer_device or self.freqs_cos_sin.device # type: ignore[assignment] - with torch.device(buffer_device): # type: ignore[arg-type] - self.freqs_cos_sin = self._precompute_freqs_cos_sin() - - if not self.enable_weight_tying and self.tok_embeddings is not None: - nn.init.normal_(self.tok_embeddings.weight) - for layer in self.layers.values(): - if layer is not None: - layer.init_weights(buffer_device) # type: ignore[operator] - if self.norm is not None: - self.norm.reset_parameters() - - final_out_std = self.model_args.dim**-0.5 - cutoff_factor = 3 - if self.lm_head is not None: - nn.init.trunc_normal_( - self.lm_head.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - - if self.enable_weight_tying: - self.tok_embeddings.weight = self.lm_head.weight - - def _precompute_freqs_cos_sin(self) -> torch.Tensor: - return precompute_freqs_cos_sin( - self.model_args.head_dim, - self.model_args.max_seq_len, - self.model_args.rope_theta, - ) - - def _token_embedding(self, tokens: torch.Tensor) -> torch.Tensor: - weight = self.tok_embeddings.weight - if weight.device != tokens.device and weight.device.type == "meta": - weight = weight.to(tokens.device) - return F.embedding(tokens, weight) - - def forward(self, tokens: torch.Tensor, input_batch: Optional[torch.Tensor] = None): - h = self._token_embedding(tokens) if self.tok_embeddings is not None else tokens - - for layer in self.layers.values(): - h = layer(h, self.freqs_cos_sin) - - h = _rms_norm(h, self.norm) if self.norm is not None else h - output = _linear(h, self.lm_head) if self.lm_head is not None else h - return output - - -_MODULE_FQN = 
"module_fqn" - - -def _annotate_once(fn: Callable, meta: dict): - if getattr(fn, "_graph_trainer_annotated", False): - return fn - wrapped = fx_traceback.annotate_fn(meta)(fn) - setattr(wrapped, "_graph_trainer_annotated", True) - return wrapped - - -def _annotate_module_fqns(model: nn.Module) -> None: - for fqn, submodule in model.named_modules(): - if fqn: - submodule.forward = _annotate_once( - submodule.forward, - {_MODULE_FQN: fqn}, - ) - - -def annotate_qwen3_for_graph_trainer(model: Transformer) -> None: - """Attach graph_trainer-compatible FX annotations to AP's Qwen3 model.""" - global qwen3_moe_local_mapped_region - - qwen3_moe_local_mapped_region = _annotate_once( - qwen3_moe_local_mapped_region, - {"EP": "compute"}, - ) - MoE.forward = _annotate_once( # type: ignore[method-assign] - MoE.forward, - {"EP": "compute"}, - ) - _annotate_module_fqns(model) diff --git a/autoparallel/serialization.py b/autoparallel/serialization.py index 4c9167d4..c9474746 100644 --- a/autoparallel/serialization.py +++ b/autoparallel/serialization.py @@ -193,8 +193,7 @@ def save_optimizer(opt, path): "dv_costs_keys": dv_costs_keys, "dv_costs_vals": dv_costs_vals, "cluster_links_node_by_name": { - opt.nodes[c].name: opt.nodes[r].name - for c, r in opt.cluster_links.items() + opt.nodes[c].name: opt.nodes[r].name for c, r in opt.cluster_links.items() }, "constraint_log": opt._constraint_log, "selected_keys_by_name": selected_keys_by_name, diff --git a/examples/_bench_3d_cert.py b/examples/_bench_3d_cert.py deleted file mode 100644 index 956489cb..00000000 --- a/examples/_bench_3d_cert.py +++ /dev/null @@ -1,154 +0,0 @@ -"""3D optimality certificate for the merged solver on full LLaMA3-1B. - -The 3D ILP has ~8M binary variables; the exact CBC solve (and even CBC's LP -relaxation) is impractical (a 2.6 GB MPS file; CBC simplex runs for hours). 
The -LP relaxation is empirically integral for this problem (verified on 2D, where it -equals the exact optimum), so its objective is a tight lower bound on the ILP -optimum. We solve that LP with HiGHS (scipy.optimize.linprog), which handles the -8M-variable sparse LP in minutes, then compare to the approximate solvers. - -One full PuLP build feeds: the HiGHS LP lower bound (optimality reference), and -the prune+dp / merged approximate objectives. Reports the certified gaps. Env: -MESH, SEQLEN. -""" -import logging -import os -import time -from unittest.mock import patch - -import numpy as np -import pulp -import scipy.sparse as sp -import torch -from scipy.optimize import linprog -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), - ("get_device_capability", lambda *a, **k: (9, 0))]: - patch(f"torch.cuda.{fn}", val).start() -patch("torch.cuda.get_device_properties", lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() - - -def log(m=""): - print(m, flush=True) - - -SEQLEN = int(os.environ.get("SEQLEN", "2048")) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) -ws = 1 -for d in MESH_SHAPE: - ws *= d -names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] 
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) -mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) -ndim = mesh.ndim -vocab_size = 128256 -batch_size = 2 * mesh.shape[0] - - -def model_fn(): - args = TransformerModelArgs( - dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, - multiple_of=256, rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN) - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") - - -def lp_lower_bound_highs(opt): - """Solve the LP relaxation (binaries -> [0,1]) of opt.prob with HiGHS and - return its objective: a tight lower bound on the ILP optimum.""" - variables = opt.prob.variables() - idx = {v.name: i for i, v in enumerate(variables)} - n = len(variables) - c = np.zeros(n) - for v, coeff in opt.prob.objective.items(): - c[idx[v.name]] += coeff - rows_eq, cols_eq, data_eq, b_eq = [], [], [], [] - rows_ub, cols_ub, data_ub, b_ub = [], [], [], [] - r_eq = r_ub = 0 - for con in opt.prob.constraints.values(): - rhs = -con.constant - items = list(con.items()) - if con.sense == pulp.LpConstraintEQ: - for v, coeff in items: - rows_eq.append(r_eq); cols_eq.append(idx[v.name]); data_eq.append(coeff) - b_eq.append(rhs); r_eq += 1 - else: # LE: a<=b ; GE: a>=b -> -a<=-b - sign = 1.0 if con.sense == pulp.LpConstraintLE else -1.0 - for v, coeff in items: - rows_ub.append(r_ub); cols_ub.append(idx[v.name]); data_ub.append(sign * coeff) - b_ub.append(sign * rhs); r_ub += 1 - A_eq = sp.csr_matrix((data_eq, (rows_eq, cols_eq)), shape=(r_eq, n)) if r_eq else None - A_ub = sp.csr_matrix((data_ub, (rows_ub, cols_ub)), shape=(r_ub, n)) if r_ub else None - res = linprog(c, A_ub=A_ub, b_ub=(b_ub or None), A_eq=A_eq, b_eq=(b_eq or None), - bounds=(0, 1), method="highs") - if not res.success: - raise RuntimeError(f"HiGHS LP failed: {res.message}") - 
return res.fun, n, r_eq + r_ub - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -x = (Shard(0),) + (Replicate(),) * (ndim - 1) -out = (Shard(0), Shard(2)) if ndim == 2 else x - -log(f"=== 3D cert (HiGHS): LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===") -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp") -autop.__enter__() -autop.add_parameter_memory_constraint(low=None, high=None) -autop.add_input_constraints([x]) -autop.add_output_constraints([out]) -opt = autop.sharding_optimizer -opt._set_objective() -opt._apply_memory_constraint() -log(f"[build] {time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)} " - f"pulp_vars={len(opt.pulp_variables)} constraints={len(opt.prob.constraints)}") - -# prune+dp (approx, no annotation) on the same problem. -t = time.perf_counter() -ApproximateShardingSolver(opt).get_solution(verbose=False) -prune_dp = opt.profile["approximate"]["objective"] -log(f"[prune+dp] approx {time.perf_counter()-t:.1f}s objective={prune_dp:.1f}") - -# merged (prune+dp+annotated): propagate the TP plan, then approx-solve. -cp = (None,) * (ndim - 1) + (Shard(0),) -rp = (None,) * (ndim - 1) + (Shard(1),) -for proj in ["wq", "wk", "wv"]: - autop.annotate_parameter(f"layers.*.attention.{proj}.weight", cp) -autop.annotate_parameter("layers.*.attention.wo.weight", rp) -for proj in ["w1", "w3"]: - autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", cp) -autop.annotate_parameter("layers.*.feed_forward.w2.weight", rp) -autop.propagate_annotations(verbose=False, method="fix") -t = time.perf_counter() -ApproximateShardingSolver(opt).get_solution(verbose=False) -merged = opt.profile["approximate"]["objective"] -log(f"[merged] approx {time.perf_counter()-t:.1f}s objective={merged:.1f}") - -# LP relaxation lower bound via HiGHS = optimality reference. 
-t = time.perf_counter() -lb, nvar, ncon = lp_lower_bound_highs(opt) -log(f"[LP-bound] HiGHS {time.perf_counter()-t:.1f}s lower_bound={lb:.1f} " - f"(vars={nvar} cons={ncon})") - -log("") -for name, obj in [("prune+dp", prune_dp), ("merged", merged)]: - gap = 100 * (obj - lb) / lb - log(f"=== 3D {name:<9} gap = {gap:+.2f}% (obj {obj:.1f} vs LP lower bound " - f"{lb:.1f}) <=10%: {abs(gap)<=10} <=5%: {abs(gap)<=5} ===") diff --git a/examples/_bench_anno.py b/examples/_bench_anno.py deleted file mode 100644 index 45e546ff..00000000 --- a/examples/_bench_anno.py +++ /dev/null @@ -1,116 +0,0 @@ -"""prune+dp+annotation (the full joint config) vs prune+dp alone, compared to a -known optimum/LP lower bound. Lite build + optional TP-plan annotation + approx. -Env: MODEL, MESH, SEQLEN, LP_LB.""" -import logging -import os -import time -from unittest.mock import patch - -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), - ("get_device_capability", lambda *a, **k: (9, 0))]: - patch(f"torch.cuda.{fn}", val).start() -patch("torch.cuda.get_device_properties", lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() - -MODEL = os.environ.get("MODEL", "70b") -SEQLEN = int(os.environ.get("SEQLEN", "2048")) -MESH_SHAPE = tuple(int(x) for x in 
os.environ.get("MESH", "2,4,8").split(",")) -LP_LB = float(os.environ.get("LP_LB", "0")) -ws = 1 -for d in MESH_SHAPE: - ws *= d -names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] -torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) -mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) -ndim = mesh.ndim -vocab_size = 128256 -batch_size = 2 * mesh.shape[0] -_CFG = { - "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), - "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), - "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), - "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), -} - - -def model_fn(): - args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size, - max_seq_len=SEQLEN, **_CFG[MODEL]) - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") - - -COLUMN_PARALLEL = (None,) * (ndim - 1) + (Shard(0),) -ROW_PARALLEL = (None,) * (ndim - 1) + (Shard(1),) - - -def annotate_tp_plan(autop): - for proj in ["wq", "wk", "wv"]: - autop.annotate_parameter(f"layers.*.attention.{proj}.weight", COLUMN_PARALLEL) - autop.annotate_parameter("layers.*.attention.wo.weight", ROW_PARALLEL) - for proj in ["w1", "w3"]: - autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", COLUMN_PARALLEL) - autop.annotate_parameter("layers.*.feed_forward.w2.weight", ROW_PARALLEL) - - -def constrain(autop): - x = (Shard(0),) + (Replicate(),) * (ndim - 1) - out = (Shard(0), Shard(2)) if ndim == 2 else x - autop.add_parameter_memory_constraint(low=None, high=None) - autop.add_input_constraints([x]) - autop.add_output_constraints([out]) - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) 
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -print(f"### anno MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ###", flush=True) - - -def gap(o): - return 100 * (o - LP_LB) / LP_LB if LP_LB else float("nan") - - -# prune+dp (no annotation) -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") -autop.__enter__() -constrain(autop) -build_s = time.perf_counter() - t -opt = autop.sharding_optimizer -t = time.perf_counter() -ApproximateShardingSolver(opt).get_solution(verbose=False) -dp_s = time.perf_counter() - t -obj_dp = opt.profile["approximate"]["objective"] -print(f"[dp] build={build_s:.1f}s approx={dp_s:.1f}s obj={obj_dp:.1f} gap={gap(obj_dp):+.2f}%", flush=True) - -# + annotation -t = time.perf_counter() -annotate_tp_plan(autop) -prop = autop.propagate_annotations(verbose=False, method="fix") -prop_s = time.perf_counter() - t -t = time.perf_counter() -ApproximateShardingSolver(opt).get_solution(verbose=False) -ann_s = time.perf_counter() - t -obj_ann = opt.profile["approximate"]["objective"] -print(f"[dp+anno] build={build_s:.1f}s propagate={prop_s:.1f}s approx={ann_s:.1f}s " - f"total={build_s+prop_s+ann_s:.1f}s obj={obj_ann:.1f} gap={gap(obj_ann):+.2f}% " - f"(pinned {prop.nodes_determined} nodes)", flush=True) -print(f"[RESULT] MODEL={MODEL} mesh={MESH_SHAPE} dp_gap={gap(obj_dp):+.2f}% " - f"dp+anno_gap={gap(obj_ann):+.2f}% dp+anno_total={build_s+prop_s+ann_s:.1f}s", flush=True) diff --git a/examples/_bench_approx.py b/examples/_bench_approx.py deleted file mode 100644 index 272c47aa..00000000 --- a/examples/_bench_approx.py +++ /dev/null @@ -1,166 +0,0 @@ -"""Benchmark approximate solver vs ILP: objective + solve time. - -Setting: LLaMA3 (1b default) on a 2D (dp, tp) mesh with vocab parallelism and -the canonical example_llama3 constraints. 
Both solvers run on the SAME built -optimizer: approx first (it only fills varValues/objective via an idempotent -_set_objective), then a fresh CBC solve for the ILP. This avoids building the -(expensive) strategy graph twice. - -Env knobs: MODEL_TYPE (1b|8b), MESH ("8,8"), N_LAYERS (0=default), SEQLEN, -REPEATED (1|0), RUN_ILP (1|0), ILP_TIMEOUT (seconds, 0=unlimited). -""" -import logging -import os -import time -from unittest.mock import patch - -import pulp -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -_alog = logging.getLogger("autoparallel.approximate_sharding") -_alog.setLevel(logging.INFO) -_alog.addHandler(logging.StreamHandler()) - - -def log(msg): - print(msg, flush=True) - - -_PATCHES = [ - patch("torch.cuda.device_count", lambda: 8), - patch("torch.cuda.get_device_name", lambda *a, **k: "H100"), - patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)), - patch( - "torch.cuda.get_device_properties", - lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132} - )(), - ), -] -for p in _PATCHES: - p.start() - -MODEL_TYPE = os.environ.get("MODEL_TYPE", "1b") -N_LAYERS = int(os.environ.get("N_LAYERS", "0")) -SEQLEN = int(os.environ.get("SEQLEN", str(2048 * 4))) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(",")) -REPEATED = os.environ.get("REPEATED", "1") == "1" -RUN_ILP = 
os.environ.get("RUN_ILP", "1") == "1" -LP_BOUND = os.environ.get("LP_BOUND", "1") == "1" -ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "1200")) - -world_size = 1 -for d in MESH_SHAPE: - world_size *= d - -_NAMES = {1: ("dp",), 2: ("dp", "tp"), 3: ("dp", "cp", "tp"), - 4: ("dp", "cp", "tp", "ep")} -mesh_names = _NAMES[len(MESH_SHAPE)] -fake_store = FakeStore() -torch.distributed.init_process_group("fake", store=fake_store, rank=0, world_size=world_size) -mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", MESH_SHAPE, mesh_dim_names=mesh_names -) - -vocab_size = 128256 -batch_size = 2 * mesh.shape[0] -seqlen = SEQLEN - - -def model_fn(): - if MODEL_TYPE == "1b": - args = TransformerModelArgs( - dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, - ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000, - vocab_size=vocab_size, max_seq_len=seqlen, - ) - elif MODEL_TYPE == "8b": - args = TransformerModelArgs( - dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, - ffn_dim_multiplier=1.3, multiple_of=1024, rope_theta=500000, - vocab_size=vocab_size, max_seq_len=seqlen, - ) - else: - raise ValueError(MODEL_TYPE) - if N_LAYERS: - args.n_layers = N_LAYERS - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda") - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) - -log(f"model={MODEL_TYPE} layers={N_LAYERS or 'default'} mesh={MESH_SHAPE} " - f"world={world_size} seqlen={seqlen} repeated_subgraphs={REPEATED} " - f"ilp_timeout={ILP_TIMEOUT}") - -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=REPEATED) -autop.__enter__() -ndim = mesh.ndim -x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1) -# vocab-parallel output only defined for 2D (matches example_llama3); otherwise -# constrain the output like the input. 
-out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding -autop.add_parameter_memory_constraint(low=None, high=None) -autop.add_input_constraints([x_sharding]) -autop.add_output_constraints([out_sharding]) -opt = autop.sharding_optimizer -log(f"[build] optimizer ready in {time.perf_counter() - t:.2f}s " - f"vars={len(opt.pulp_variables)} constraints={len(opt.prob.constraints)} " - f"nodes={len(opt.nodes)}") - -# ---- APPROX ---- -t = time.perf_counter() -approx = ApproximateShardingSolver(opt) -approx.get_solution(verbose=True) -ap_t = time.perf_counter() - t -ap_obj = pulp.value(opt.prob.objective) -prof = opt.profile.get("approximate", {}) -log(f"\n[APPROX] objective={ap_obj:.2f} solve_time={ap_t:.3f}s") -log(f" groups={prof.get('groups')} sweeps={prof.get('sweeps')} " - f"build={prof.get('build_s'):.3f}s search={prof.get('solve_s'):.3f}s " - f"writeback={ap_t - prof.get('build_s', 0) - prof.get('solve_s', 0):.3f}s") - -# ---- LP relaxation lower bound (certified suboptimality upper bound) ---- -if LP_BOUND: - lb_res = opt.get_lower_bound(verbose=False) - lb = lb_res.objective - if lb and lb > 0: - cert = (ap_obj - lb) / lb - log(f"\n[LP-bound] lower_bound={lb:.2f} solve={lb_res.solve_s:.2f}s " - f"=> approx within {cert*100:.2f}% of optimum (certified upper bound)") - -# ---- ILP (fresh CBC solve on the same problem) ---- -if RUN_ILP: - opt._set_objective() # idempotent: objective already populated by approx - kw = {"msg": True} - if ILP_TIMEOUT > 0: - kw["timeLimit"] = ILP_TIMEOUT - log(f"\n[ILP] solving with CBC (timeLimit={ILP_TIMEOUT or 'none'})...") - t = time.perf_counter() - opt.prob.solve(pulp.PULP_CBC_CMD(**kw)) - ilp_t = time.perf_counter() - t - ilp_obj = pulp.value(opt.prob.objective) - status = pulp.LpStatus[opt.prob.status] - log(f"[ILP] objective={ilp_obj:.2f} solve_time={ilp_t:.3f}s status={status}") - - gap = (ap_obj - ilp_obj) / ilp_obj - log(f"\n=== objective gap = {gap*100:+.2f}% solve speedup = {ilp_t/ap_t:.1f}x ===") - log(f"=== 
within 20% ? {abs(gap) <= 0.20} (ILP status: {status}) ===") diff --git a/examples/_bench_approx_diag.py b/examples/_bench_approx_diag.py deleted file mode 100644 index 25de4d85..00000000 --- a/examples/_bench_approx_diag.py +++ /dev/null @@ -1,173 +0,0 @@ -"""Diagnose the bare approx gap: is the factor graph FAITHFUL (scores the true -optimum correctly -> solver is at fault) or UNFAITHFUL (drops cost -> model is at -fault), and is the optimum REPRESENTABLE in the group choices (pruning)? - -Builds the ILP, solves it exactly with CBC, then checks whether the approx's own -machinery (total_objective + factor graph) reproduces the CBC optimum, and where -the approx's own solution differs. Env: MODEL, MESH, SEQLEN.""" -import logging -import os -import time -from collections import defaultdict -from unittest.mock import patch - -import pulp -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), - ("get_device_capability", lambda *a, **k: (9, 0))]: - patch(f"torch.cuda.{fn}", val).start() -patch("torch.cuda.get_device_properties", lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() - -MODEL = os.environ.get("MODEL", "1b") -SEQLEN = int(os.environ.get("SEQLEN", "2048")) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", 
"8,8").split(",")) -ws = 1 -for d in MESH_SHAPE: - ws *= d -names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] -torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) -mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) -ndim = mesh.ndim -vocab_size = 128256 -batch_size = 2 * mesh.shape[0] -_CFG = { - "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), - "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), - "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), - "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), -} - - -def model_fn(): - args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size, - max_seq_len=SEQLEN, **_CFG[MODEL]) - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") - - -def constrain(autop): - x = (Shard(0),) + (Replicate(),) * (ndim - 1) - out = (Shard(0), Shard(2)) if ndim == 2 else x - autop.add_parameter_memory_constraint(low=None, high=None) - autop.add_input_constraints([x]) - autop.add_output_constraints([out]) - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -print(f"### diag MODEL={MODEL} mesh={MESH_SHAPE}{names} ###", flush=True) - -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp") -autop.__enter__() -constrain(autop) -opt = autop.sharding_optimizer -print(f"[build] {time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True) - -opt._set_objective() -opt._apply_memory_constraint() -t = time.perf_counter() -opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, 
options=["preprocess off"])) -obj_cbc = pulp.value(opt.prob.objective) -print(f"[cbc] solve={time.perf_counter()-t:.1f}s obj={obj_cbc:.1f} status={pulp.LpStatus[opt.prob.status]}", flush=True) - -# CBC per-(root)node chosen out_idx -cbc_out = {} -for key, var in opt.pulp_variables.items(): - v = var.varValue - if v is not None and v > 0.5: - cbc_out[key[0]] = key[2] - -approx = ApproximateShardingSolver(opt) -approx._build_problem() -approx._build_factors() - -# (A) FAITHFULNESS: exact objective of the CBC solution via the approx machinery. -approx.cur_out = dict(cbc_out) -e_cbc_total = approx.total_objective() -print(f"[faithful] approx.total_objective(CBC soln) = {e_cbc_total:.1f} " - f"(CBC obj {obj_cbc:.1f}; match={abs(e_cbc_total-obj_cbc)<1.0})", flush=True) - -# (B) REPRESENTABILITY: can the group choices express the CBC solution? -cbc_full = dict(cbc_out) -for copy_idx, root_idx in opt.cluster_links.items(): - if root_idx in cbc_out: - cbc_full[copy_idx] = cbc_out[root_idx] -unrep = [] -cbc_group_choice = {} -for gid, g in enumerate(approx.groups): - found = None - for ci, choice in enumerate(g.choices): - if all(cbc_full.get(m) == o for m, o in choice.items()): - found = ci - break - if found is None: - unrep.append(gid) - else: - cbc_group_choice[gid] = found -print(f"[representable] groups={len(approx.groups)} " - f"with_no_matching_choice={len(unrep)}", flush=True) - -# (C) factor-graph energy of the CBC solution (if representable) -if not unrep: - for gid, ci in cbc_group_choice.items(): - approx._set_group(gid, ci) - fge = approx._fast_total_energy() - print(f"[fg-energy] _fast_total_energy(CBC soln) = {fge:.1f} " - f"(match CBC {abs(fge-obj_cbc)<1.0})", flush=True) - -# (D) run the normal approx, localize where it differs from CBC -approx2 = ApproximateShardingSolver(opt) -approx2.get_solution(verbose=False) -obj_approx = opt.profile["approximate"]["objective"] -ax_out = dict(approx2.cur_out) -print(f"[approx] obj={obj_approx:.1f} 
gap={100*(obj_approx-obj_cbc)/obj_cbc:+.2f}%", flush=True) - -# per-node exact cost under each assignment (cost_bearing nodes), to localize gap -def node_cost(solver, out_map, v): - o = out_map[v] - node = opt.nodes[v] - strat = opt.strats[node].strategies[o] - prod = solver._arg_prod.get(v, {}) - c = 0.0 - for argi in range(len(strat.redistribute_cost)): - p = prod.get(argi) - inp = out_map[p] if (p is not None and p in out_map) else 0 - key = (v, argi, o, inp) - dv = opt.decision_vars.get(key) - if dv is None: - return None - c += dv.cost - return solver.node_mult[v] * c - -diffs = [] -for v in approx2.cost_bearing: - if cbc_out.get(v) != ax_out.get(v): - c_cbc = node_cost(approx2, cbc_out, v) - c_ax = node_cost(approx2, ax_out, v) - if c_cbc is not None and c_ax is not None: - diffs.append((c_ax - c_cbc, v, opt.nodes[v].name, cbc_out.get(v), ax_out.get(v))) -diffs.sort(reverse=True) -print(f"[localize] {len(diffs)} cost-bearing nodes differ; top contributors (approx-cbc):", flush=True) -for d, v, name, oc, oa in diffs[:15]: - print(f" +{d:10.1f} node={name[:40]:40s} cbc_out={oc} approx_out={oa}", flush=True) -tot = sum(d for d, *_ in diffs) -print(f"[localize] total node-cost diff over differing nodes = {tot:.1f} " - f"(gap = {obj_approx-obj_cbc:.1f})", flush=True) diff --git a/examples/_bench_approx_ils.py b/examples/_bench_approx_ils.py deleted file mode 100644 index d6e1b437..00000000 --- a/examples/_bench_approx_ils.py +++ /dev/null @@ -1,136 +0,0 @@ -"""Diagnose whether the approx solver's objective is stuck in a local-optimum -basin that a stronger search escapes. Build once, run the stock BP+localsearch, -then run iterated local search (perturb a random set of groups, re-optimize, -keep best) for a time budget. If ILS beats the stock objective meaningfully, the -gap is a move-set/init weakness (and the LP bound is ~reachable); if not, 607260 -is robust. 
Env: MODEL, MESH, SEQLEN, LP_LB, ILS_S.""" -import logging -import os -import random -import time -from unittest.mock import patch - -import numpy as np -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), - ("get_device_capability", lambda *a, **k: (9, 0))]: - patch(f"torch.cuda.{fn}", val).start() -patch("torch.cuda.get_device_properties", lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() - -MODEL = os.environ.get("MODEL", "70b") -SEQLEN = int(os.environ.get("SEQLEN", "2048")) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) -LP_LB = float(os.environ.get("LP_LB", "0")) -ILS_S = float(os.environ.get("ILS_S", "180")) -ws = 1 -for d in MESH_SHAPE: - ws *= d -names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] -torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) -mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) -ndim = mesh.ndim -vocab_size = 128256 -batch_size = 2 * mesh.shape[0] -_CFG = { - "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), - "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), - 
"70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), -} - - -def model_fn(): - args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size, - max_seq_len=SEQLEN, **_CFG[MODEL]) - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -print(f"### ILS MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ils_s={ILS_S} ###", flush=True) - -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") -autop.__enter__() -x = (Shard(0),) + (Replicate(),) * (ndim - 1) -out = (Shard(0), Shard(2)) if ndim == 2 else x -autop.add_parameter_memory_constraint(low=None, high=None) -autop.add_input_constraints([x]) -autop.add_output_constraints([out]) -opt = autop.sharding_optimizer -print(f"[build] lite_build={time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True) - - -def gap(o): - return 100 * (o - LP_LB) / LP_LB if LP_LB else float("nan") - - -s = ApproximateShardingSolver(opt) -s._build_problem() -s._build_factors() -G = len(s.groups) -domains = [g.domain for g in s.groups] -multi = [d for d in domains if d > 1] -edges = len(s.C) -print(f"[graph] groups={G} multi_choice_groups={len(multi)} " - f"max_domain={max(domains)} sum_domain={sum(domains)} pair_edges={edges}", flush=True) - -# Stock solve (BP + local search), mirrors _solve's BP candidate. 
-deadline = time.perf_counter() + 1e9 -s._belief_propagation() -s._memory_repair() -s._coordinate_descent(deadline) -s._star_block_search(deadline) -stock = s._fast_total_energy() -best = stock -best_snap = [g.current for g in s.groups] -print(f"[stock] bp+cd+star energy={stock:.1f} gap={gap(stock):+.2f}%", flush=True) - -# Iterated local search: perturb k random multi-choice groups, re-optimize, keep best. -rng = random.Random(0) -multi_gids = [g for g in range(G) if s.groups[g].domain > 1] -t0 = time.perf_counter() -iters = 0 -accepts = 0 -while time.perf_counter() - t0 < ILS_S: - iters += 1 - # restore best, then kick - for gid, ci in enumerate(best_snap): - s._set_group(gid, ci) - k = rng.randint(1, max(2, len(multi_gids) // 10)) - for gid in rng.sample(multi_gids, min(k, len(multi_gids))): - s._set_group(gid, rng.randrange(s.groups[gid].domain)) - s._memory_repair() - s._coordinate_descent(deadline) - s._star_block_search(deadline) - e = s._fast_total_energy() - if e < best - 1e-6: - best = e - best_snap = [g.current for g in s.groups] - accepts += 1 - print(f"[ils] iter={iters} NEW BEST energy={best:.1f} gap={gap(best):+.2f}% " - f"(k={k})", flush=True) - -for gid, ci in enumerate(best_snap): - s._set_group(gid, ci) -exact = s._write_back() -print(f"[ILS done] iters={iters} accepts={accepts} stock={stock:.1f} " - f"best={best:.1f} exact_obj={exact:.1f} gap={gap(exact):+.2f}% " - f"(improvement vs stock = {100*(stock-best)/stock:.2f}%)", flush=True) diff --git a/examples/_bench_approx_sweep.py b/examples/_bench_approx_sweep.py deleted file mode 100644 index 3d73a070..00000000 --- a/examples/_bench_approx_sweep.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Build one model (lite) once, then run ApproximateShardingSolver under several -hyperparameter configs to see whether the objective gap (vs a known LP lower -bound) is closable by tuning (candidate pruning / BP iters / time / local search) -or is structural. 
Env: MODEL, MESH, SEQLEN, LP_LB (reference lower bound).""" -import logging -import os -import time -from unittest.mock import patch - -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), - ("get_device_capability", lambda *a, **k: (9, 0))]: - patch(f"torch.cuda.{fn}", val).start() -patch("torch.cuda.get_device_properties", lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() - -MODEL = os.environ.get("MODEL", "70b") -SEQLEN = int(os.environ.get("SEQLEN", "2048")) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) -LP_LB = float(os.environ.get("LP_LB", "0")) -ws = 1 -for d in MESH_SHAPE: - ws *= d -names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] -torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) -mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) -ndim = mesh.ndim -vocab_size = 128256 -batch_size = 2 * mesh.shape[0] -_CFG = { - "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), - "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), - "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, 
ffn_dim_multiplier=1.3, multiple_of=1024), - "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), -} - - -def model_fn(): - args = TransformerModelArgs( - rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL]) - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -print(f"### approx sweep MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ###", flush=True) - -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") -autop.__enter__() -x = (Shard(0),) + (Replicate(),) * (ndim - 1) -out = (Shard(0), Shard(2)) if ndim == 2 else x -autop.add_parameter_memory_constraint(low=None, high=None) -autop.add_input_constraints([x]) -autop.add_output_constraints([out]) -opt = autop.sharding_optimizer -print(f"[build] lite_build={time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True) - -CONFIGS = [ - ("default", dict()), - ("cand=256", dict(candidate_limit=256)), - ("cand=None", dict(candidate_limit=None)), - ("bp=100", dict(bp_iters=100)), - ("sweeps=200,star=20,t=600", dict(max_sweeps=200, star_passes=20, max_time_s=600)), - ("star_children=64,domain=4096", dict(max_star_children=64, group_domain_limit=4096)), - ("ALL generous", dict(candidate_limit=None, bp_iters=100, max_sweeps=200, - star_passes=20, max_time_s=900, max_star_children=64, - group_domain_limit=4096)), -] - -best = None -for name, cfg in CONFIGS: - t = time.perf_counter() - solver = ApproximateShardingSolver(opt, **cfg) - solver.get_solution(verbose=False) - dt = time.perf_counter() - t - ap = opt.profile["approximate"] - obj = ap["objective"] - gap = 100 * (obj - LP_LB) / LP_LB if LP_LB else float("nan") - winner = 
"bp" if ap["bp_energy"] <= ap["greedy_energy"] else "greedy" - print(f"[cfg] {name:30s} obj={obj:.1f} gap={gap:+.2f}% " - f"bp={ap['bp_energy']:.1f} greedy={ap['greedy_energy']:.1f} win={winner} " - f"t={dt:.1f}s", flush=True) - if best is None or obj < best[1]: - best = (name, obj) - -print(f"[BEST] {best[0]} obj={best[1]:.1f} " - f"gap={100*(best[1]-LP_LB)/LP_LB:+.2f}%" if LP_LB else f"[BEST] {best[0]} obj={best[1]:.1f}", - flush=True) diff --git a/examples/_bench_build_profile.py b/examples/_bench_build_profile.py deleted file mode 100644 index 03b6a2c9..00000000 --- a/examples/_bench_build_profile.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Dump the lite-build phase breakdown (tracing vs strategy enumeration vs -decision-var cost estimation) for LLaMA3-1B on a 3D mesh, to see where the -~615s build time goes. Env: MESH, SEQLEN.""" -import json -import logging -import os -import time -from unittest.mock import patch - -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -if os.environ.get("DEBUG_CLUSTER") == "1": - h = logging.StreamHandler() - h.setLevel(logging.DEBUG) - for nm in ("autoparallel.graph_passes.graph_clustering", "autoparallel.optimize_sharding"): - lg = logging.getLogger(nm) - lg.setLevel(logging.DEBUG) - lg.addHandler(h) -for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), - ("get_device_capability", lambda *a, **k: (9, 0))]: - patch(f"torch.cuda.{fn}", val).start() -patch("torch.cuda.get_device_properties", lambda *a, **k: 
type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() - -MODEL = os.environ.get("MODEL", "1b") -SEQLEN = int(os.environ.get("SEQLEN", "2048")) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) -_CFG = { - "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), - "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), - "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), - "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), -} -ws = 1 -for d in MESH_SHAPE: - ws *= d -names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] -torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) -mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) -ndim = mesh.ndim -vocab_size = 128256 -batch_size = 2 * mesh.shape[0] - - -def model_fn(): - args = TransformerModelArgs( - rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL]) - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -print(f"=== build profile: MODEL={MODEL} mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===", flush=True) - -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") -autop.__enter__() -enter_s = time.perf_counter() - t -opt = autop.sharding_optimizer -tm = opt.profile["timings"] -init = tm.get("init_total_s", 0.0) -tracing = enter_s - init # __enter__ = tracing + ShardingOptimizer construction - -print(json.dumps({ - 
"enter_total_s": round(enter_s, 1), - "tracing_s (enter - optimizer_init)": round(tracing, 1), - "optimizer_init_total_s": round(init, 1), - " strategy_enumeration_s": round(tm.get("strategy_enumeration_s", 0), 1), - " decision_var_build_s": round(tm.get("decision_var_build_s", 0), 1), - " compute_cost_estimation_s": round(tm.get("compute_cost_estimation_s", 0), 1), - " edge_cost_estimation_s": round(tm.get("edge_cost_estimation_s", 0), 1), - " pulp_var_creation_s (0 in lite)": round(tm.get("pulp_var_creation_s", 0), 1), - " validation_s": round(tm.get("validation_s", 0), 1), - "decision_vars": len(opt.decision_vars), - "graph_nodes": opt.profile["model"]["graph_nodes"], - "strategy_options": opt.profile["strategies"]["strategy_options"], - "option_tuples (edges)": opt.profile["strategies"]["option_tuples"], -}, indent=2), flush=True) diff --git a/examples/_bench_build_verify.py b/examples/_bench_build_verify.py deleted file mode 100644 index 08fea734..00000000 --- a/examples/_bench_build_verify.py +++ /dev/null @@ -1,92 +0,0 @@ -"""A/B verify that the fast build (AP_FAST_BUILD=1) produces byte-identical -decision_vars + approx objective as the baseline (AP_FAST_BUILD=0), and report -build time. Run the same MESH/MODEL with both env values and diff the dv_hash. 
-Env: MESH, SEQLEN, MODEL (tiny|1b).""" -import hashlib -import logging -import os -import time -from unittest.mock import patch - -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), - ("get_device_capability", lambda *a, **k: (9, 0))]: - patch(f"torch.cuda.{fn}", val).start() -patch("torch.cuda.get_device_properties", lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() - -SEQLEN = int(os.environ.get("SEQLEN", "2048")) -MODEL = os.environ.get("MODEL", "tiny") -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "4,2").split(",")) -ws = 1 -for d in MESH_SHAPE: - ws *= d -names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] -torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) -mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) -ndim = mesh.ndim -vocab_size = 128 if MODEL == "tiny" else 128256 -batch_size = 2 * mesh.shape[0] - - -def model_fn(): - if MODEL == "tiny": - args = TransformerModelArgs(dim=64, n_layers=2, n_heads=4, n_kv_heads=2, - vocab_size=vocab_size, multiple_of=32, - rope_theta=500000, max_seq_len=SEQLEN) - else: - args = TransformerModelArgs(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, - 
ffn_dim_multiplier=1.5, multiple_of=256, - rope_theta=500000, vocab_size=vocab_size, - max_seq_len=SEQLEN) - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -x = (Shard(0),) + (Replicate(),) * (ndim - 1) -out = (Shard(0), Shard(2)) if ndim == 2 else x - -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp") -autop.__enter__() -autop.add_parameter_memory_constraint(low=None, high=None) -autop.add_input_constraints([x]) -autop.add_output_constraints([out]) -build_s = time.perf_counter() - t -opt = autop.sharding_optimizer - -# Canonical, exact dump of every decision var's costs. -items = [] -for key in sorted(opt.decision_vars.keys()): - dv = opt.decision_vars[key] - items.append((key, repr(dv.cost), repr(dv.comm_cost), repr(dv.compute_cost), - repr(dv.sharding_transition_cost))) -dv_hash = hashlib.sha256(repr(items).encode()).hexdigest() - -t = time.perf_counter() -ApproximateShardingSolver(opt).get_solution(verbose=False) -approx_s = time.perf_counter() - t -obj = opt.profile["approximate"]["objective"] - -print(f"AP_FAST_BUILD={os.environ.get('AP_FAST_BUILD', '1')} MODEL={MODEL} " - f"MESH={MESH_SHAPE} build={build_s:.2f}s approx={approx_s:.2f}s " - f"n_dv={len(opt.decision_vars)} dv_hash={dv_hash[:32]} " - f"approx_obj={obj!r}", flush=True) diff --git a/examples/_bench_dp_alone.py b/examples/_bench_dp_alone.py deleted file mode 100644 index 4b67c3a1..00000000 --- a/examples/_bench_dp_alone.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Minimal approx-solver timing, for the 'dp alone' (approx WITHOUT prune) -baseline. Run it with PYTHONPATH pointing at the dp_solver checkout to get the -unpruned numbers, and at the merge checkout to cross-check prune+dp. 
- -Reports lite-build time, approx solve time, decision-var count and objective for -LLaMA3-1B with the canonical constraints. Env: MESH, SEQLEN, N_LAYERS. -""" -import logging -import os -import time -from unittest.mock import patch - -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -import autoparallel -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), - ("get_device_capability", lambda *a, **k: (9, 0))]: - patch(f"torch.cuda.{fn}", val).start() -patch("torch.cuda.get_device_properties", lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() - -N_LAYERS = int(os.environ.get("N_LAYERS", "0")) -SEQLEN = int(os.environ.get("SEQLEN", "2048")) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(",")) -ws = 1 -for d in MESH_SHAPE: - ws *= d -names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] -torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) -mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) -ndim = mesh.ndim -vocab_size = 128256 -batch_size = 2 * mesh.shape[0] - - -def model_fn(): - args = TransformerModelArgs( - dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, - multiple_of=256, rope_theta=500000, vocab_size=vocab_size, 
max_seq_len=SEQLEN) - if N_LAYERS: - args.n_layers = N_LAYERS - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -x = (Shard(0),) + (Replicate(),) * (ndim - 1) -out = (Shard(0), Shard(2)) if ndim == 2 else x - -print(f"autoparallel = {autoparallel.__file__}", flush=True) -print(f"=== dp-alone (approx) LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} " - f"layers={N_LAYERS or 16} ===", flush=True) - -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") -autop.__enter__() -autop.add_parameter_memory_constraint(low=None, high=None) -autop.add_input_constraints([x]) -autop.add_output_constraints([out]) -t_build = time.perf_counter() - t -opt = autop.sharding_optimizer - -# With MERGED=1, add the propagated TP plan before solving (full joint solver). 
-t_prop = 0.0 -label = "dp-alone" -if os.environ.get("MERGED") == "1": - label = "merged" - cp = (None,) * (ndim - 1) + (Shard(0),) - rp = (None,) * (ndim - 1) + (Shard(1),) - for proj in ["wq", "wk", "wv"]: - autop.annotate_parameter(f"layers.*.attention.{proj}.weight", cp) - autop.annotate_parameter("layers.*.attention.wo.weight", rp) - for proj in ["w1", "w3"]: - autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", cp) - autop.annotate_parameter("layers.*.feed_forward.w2.weight", rp) - t = time.perf_counter() - autop.propagate_annotations(verbose=False, method="fix") - t_prop = time.perf_counter() - t - -t = time.perf_counter() -ApproximateShardingSolver(opt).get_solution(verbose=False) -t_solve = time.perf_counter() - t -obj = opt.profile["approximate"]["objective"] - -print(f"[{label}] build={t_build:.2f}s propagate={t_prop:.2f}s " - f"approx_solve={t_solve:.2f}s total={t_build + t_prop + t_solve:.2f}s " - f"obj={obj:.1f} decision_vars={len(opt.decision_vars)}", flush=True) diff --git a/examples/_bench_lp_3d.py b/examples/_bench_lp_3d.py deleted file mode 100644 index 5b08840b..00000000 --- a/examples/_bench_lp_3d.py +++ /dev/null @@ -1,107 +0,0 @@ -"""Benchmark LP-relaxation solve time for LLaMA3 on a 3D mesh.""" -import logging -import os -import time - -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.WARNING) - -MODEL_TYPE = os.environ.get("MODEL_TYPE", "8b") -N_LAYERS = int(os.environ.get("N_LAYERS", "0")) # 0 => use default for model -SEQLEN = 
int(os.environ.get("SEQLEN", str(2048 * 4))) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) -MESH_NAMES = ("dp", "cp", "tp") - -world_size = 1 -for d in MESH_SHAPE: - world_size *= d - -fake_store = FakeStore() -torch.distributed.init_process_group("fake", store=fake_store, rank=0, world_size=world_size) - -mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=MESH_NAMES) - -batch_size = 2 * mesh.shape[0] -seqlen = SEQLEN -vocab_size = 128256 -device = torch.device("cuda") - - -def model_fn(): - if MODEL_TYPE == "1b": - args = TransformerModelArgs( - dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, - ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000, - vocab_size=vocab_size, max_seq_len=seqlen, - ) - elif MODEL_TYPE == "8b": - args = TransformerModelArgs( - dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, - ffn_dim_multiplier=1.3, multiple_of=1024, rope_theta=500000, - vocab_size=vocab_size, max_seq_len=seqlen, - ) - elif MODEL_TYPE == "70b": - args = TransformerModelArgs( - dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, - ffn_dim_multiplier=1.3, multiple_of=4096, rope_theta=500000, - vocab_size=vocab_size, max_seq_len=seqlen, - ) - else: - raise ValueError(MODEL_TYPE) - if N_LAYERS: - args.n_layers = N_LAYERS - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, seqlen), device=device) - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) - -with torch.device("meta"): - model = model_fn() - -mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) - -print(f"=== model={MODEL_TYPE} n_layers={model.model_args.n_layers} " - f"mesh={MESH_SHAPE}{MESH_NAMES} world_size={world_size} ===") - -print("[build] entering AutoParallel (graph export + strategy enumeration)...", flush=True) -t_build = time.perf_counter() -with AutoParallel(model, input_fn, mesh, mp_policy, repeated_subgraphs=True) as autop: - 
print(f"[build] AutoParallel ready in {time.perf_counter() - t_build:.2f} s", flush=True) - autop.add_parameter_memory_constraint(low=None, high=None) - x_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1) - autop.add_input_constraints([x_sharding]) - autop.add_output_constraints([x_sharding]) - print(f"[build+constraints] {time.perf_counter() - t_build:.2f} s") - - opt = autop.sharding_optimizer - print(f"[problem] unique_vars={len(opt.pulp_variables)} " - f"constraints={len(opt.prob.constraints)}", flush=True) - - mode = os.environ.get("SOLVE_MODE", "lp") # lp | ilp | both - - if mode in ("lp", "both"): - res = opt.get_lower_bound(verbose=False) - print(f"[LP relaxation] status={res.status} objective={res.objective:.4f}") - print(f"[LP relaxation] solve_s={res.solve_s:.3f} total_s={res.total_s:.3f}", flush=True) - - if mode in ("ilp", "both"): - print("[ILP] solving (this may take a long time)...", flush=True) - t_ilp = time.perf_counter() - opt.get_solution(verbose=True) - import pulp - obj = pulp.value(opt.prob.objective) - print(f"[ILP] status={pulp.LpStatus[opt.prob.status]} objective={obj}") - print(f"[ILP] solve+extract_s={time.perf_counter() - t_ilp:.3f}", flush=True) diff --git a/examples/_bench_lp_integrality.py b/examples/_bench_lp_integrality.py deleted file mode 100644 index 1c95b7e1..00000000 --- a/examples/_bench_lp_integrality.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Re-solve the 70B LP relaxation and report how integral the optimum is: count -fractional variables in the HiGHS solution. If ~all variables are 0/1, the LP -optimum is reachable by integers (so an approx gap is a real solver failure); if -many are fractional, the LP bound is loose (and the approx may be near-optimal). -Also reports the objective with the memory constraint dropped, to test whether -the memory budget is the fractionality source. 
Env: MODEL, MESH, SEQLEN.""" -import logging -import os -import time -from unittest.mock import patch - -import numpy as np -import pulp -import scipy.sparse as sp -import torch -from scipy.optimize import linprog -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), - ("get_device_capability", lambda *a, **k: (9, 0))]: - patch(f"torch.cuda.{fn}", val).start() -patch("torch.cuda.get_device_properties", lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() - -MODEL = os.environ.get("MODEL", "70b") -SEQLEN = int(os.environ.get("SEQLEN", "2048")) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) -DROP_MEM = os.environ.get("DROP_MEM", "0") == "1" -ws = 1 -for d in MESH_SHAPE: - ws *= d -names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] -torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) -mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) -ndim = mesh.ndim -vocab_size = 128256 -batch_size = 2 * mesh.shape[0] -_CFG = { - "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), - "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), - "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, 
ffn_dim_multiplier=1.3, multiple_of=4096), -} - - -def model_fn(): - args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size, - max_seq_len=SEQLEN, **_CFG[MODEL]) - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -print(f"### LP integrality MODEL={MODEL} mesh={MESH_SHAPE}{names} drop_mem={DROP_MEM} ###", flush=True) - -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp") -autop.__enter__() -x = (Shard(0),) + (Replicate(),) * (ndim - 1) -out = (Shard(0), Shard(2)) if ndim == 2 else x -autop.add_parameter_memory_constraint(low=None, high=None) -autop.add_input_constraints([x]) -autop.add_output_constraints([out]) -opt = autop.sharding_optimizer -print(f"[build] full_build={time.perf_counter()-t:.1f}s", flush=True) - -opt._set_objective() -if not DROP_MEM: - opt._apply_memory_constraint() -variables = opt.prob.variables() -vidx = {id(v): i for i, v in enumerate(variables)} -n = len(variables) -c = np.zeros(n) -for key, dv in opt.decision_vars.items(): - mult = 1 + len(opt._root_to_copies.get(key[0], ())) - c[vidx[id(dv.var)]] += dv.cost * mult -re = ru = 0 -reqr, reqc, reqd, beq = [], [], [], [] -rubr, rubc, rubd, bub = [], [], [], [] -for con in opt.prob.constraints.values(): - rhs = -con.constant - if con.sense == pulp.LpConstraintEQ: - for v, co in con.items(): - reqr.append(re); reqc.append(vidx[id(v)]); reqd.append(co) - beq.append(rhs); re += 1 - else: - sgn = 1.0 if con.sense == pulp.LpConstraintLE else -1.0 - for v, co in con.items(): - rubr.append(ru); rubc.append(vidx[id(v)]); rubd.append(sgn * co) - bub.append(sgn * rhs); ru += 1 -A_eq = sp.csr_matrix((reqd, (reqr, reqc)), shape=(re, n)) if re else None -A_ub = sp.csr_matrix((rubd, (rubr, 
rubc)), shape=(ru, n)) if ru else None -t = time.perf_counter() -res = linprog(c, A_ub=A_ub, b_ub=(bub or None), A_eq=A_eq, b_eq=(beq or None), - bounds=(0, 1), method="highs-ds", options={"disp": True}) -print(f"[lp] solve={time.perf_counter()-t:.1f}s status={res.message}", flush=True) -xv = res.x -freq = np.abs(xv - np.round(xv)) -nfrac = int((freq > 1e-6).sum()) -nfrac4 = int((freq > 1e-4).sum()) -# weight fractionality by objective contribution to see if it matters -frac_obj = float(np.abs(c * freq).sum()) -print(f"[RESULT] MODEL={MODEL} drop_mem={DROP_MEM} obj={res.fun:.1f} " - f"vars={n} fractional(>1e-6)={nfrac} ({100*nfrac/n:.4f}%) " - f"fractional(>1e-4)={nfrac4} frac_obj_weight={frac_obj:.1f}", flush=True) diff --git a/examples/_bench_mem_lagrangian.py b/examples/_bench_mem_lagrangian.py deleted file mode 100644 index 6166a552..00000000 --- a/examples/_bench_mem_lagrangian.py +++ /dev/null @@ -1,237 +0,0 @@ -"""Compare the Lagrangian memory-constrained approximate solve against the LP -(relaxation) optimum across a sweep of parameter-memory budgets. - -The optimizer (the expensive build) is constructed ONCE; each budget only -re-runs the cheap solves. For every budget factor `high` (with low=0): - - LP: set the memory constraint and solve the (integral) relaxation -> the - exact constrained optimum (gold standard). - - Lagrangian approx: fold lambda * ratio into the unaries and bisect lambda - until the achieved memory lands in the same [low, high] budget. -The two solvers are pinned to the SAME numeric budget (read back from the LP's -constraint rows) so the comparison is apples-to-apples. - -Env: MODEL_TYPE (1b|8b), MESH ("8,8"), N_LAYERS (0=default), SEQLEN, -HIGH_FACTORS (comma list, default sweep), BP_ITERS. 
-""" -import logging -import os -import time -from unittest.mock import patch - -import pulp -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) - - -def log(msg): - print(msg, flush=True) - - -_PATCHES = [ - patch("torch.cuda.device_count", lambda: 8), - patch("torch.cuda.get_device_name", lambda *a, **k: "H100"), - patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)), - patch( - "torch.cuda.get_device_properties", - lambda *a, **k: type( - "P", - (), - { - "major": 9, - "minor": 0, - "name": "H100", - "total_memory": 80 * 1024**3, - "multi_processor_count": 132, - }, - )(), - ), -] -for p in _PATCHES: - p.start() - -MODEL_TYPE = os.environ.get("MODEL_TYPE", "1b") -N_LAYERS = int(os.environ.get("N_LAYERS", "0")) -SEQLEN = int(os.environ.get("SEQLEN", str(2048 * 4))) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(",")) -BP_ITERS = int(os.environ.get("BP_ITERS", "120")) -HIGH_FACTORS = [ - float(x) - for x in os.environ.get( - "HIGH_FACTORS", "0.0156,0.03125,0.0625,0.125,0.25,0.5,1.0" - ).split(",") -] -# On budgets where the LP relaxation is fractional (its optimum is an -# unachievable lower bound) also solve the true ILP to report the achievable gap. 
-RUN_ILP = os.environ.get("RUN_ILP", "0") == "1" -ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "300")) - -world_size = 1 -for d in MESH_SHAPE: - world_size *= d - -_NAMES = {1: ("dp",), 2: ("dp", "tp"), 3: ("dp", "cp", "tp")} -mesh_names = _NAMES[len(MESH_SHAPE)] -fake_store = FakeStore() -torch.distributed.init_process_group( - "fake", store=fake_store, rank=0, world_size=world_size -) -mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", MESH_SHAPE, mesh_dim_names=mesh_names -) - -vocab_size = 128256 -batch_size = int(os.environ.get("BATCH", str(2 * mesh.shape[0]))) -seqlen = SEQLEN - - -def model_fn(): - args = TransformerModelArgs( - dim=2048, - n_layers=16, - n_heads=32, - n_kv_heads=8, - ffn_dim_multiplier=1.5, - multiple_of=256, - rope_theta=500000, - vocab_size=vocab_size, - max_seq_len=seqlen, - ) - if MODEL_TYPE == "8b": - args = TransformerModelArgs( - dim=4096, - n_layers=32, - n_heads=32, - n_kv_heads=8, - ffn_dim_multiplier=1.3, - multiple_of=1024, - rope_theta=500000, - vocab_size=vocab_size, - max_seq_len=seqlen, - ) - if N_LAYERS: - args.n_layers = N_LAYERS - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda") - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) - -log( - f"model={MODEL_TYPE} layers={N_LAYERS or 'default'} mesh={MESH_SHAPE} " - f"world={world_size} seqlen={seqlen} bp_iters={BP_ITERS}" -) - -# ---- build once ---- -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True) -autop.__enter__() -ndim = mesh.ndim -x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1) -out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding -# Build with a LOOSE budget so the approx build does not pin params to the -# min-ratio (fully-sharded) choices; the per-budget sweep overrides the budget -# 
numerically afterward. (A tight default would prune param strategies at build -# time and freeze the achievable memory.) -autop.add_parameter_memory_constraint(low=0.0, high=1.0) -autop.add_input_constraints([x_sharding]) -autop.add_output_constraints([out_sharding]) -opt = autop.sharding_optimizer -log( - f"[build] optimizer ready in {time.perf_counter() - t:.2f}s " - f"vars={len(opt.pulp_variables)} nodes={len(opt.nodes)}" -) - -# build the approximate solver once (ratios / factor graph / mem unary cached) -t = time.perf_counter() -approx = ApproximateShardingSolver(opt, bp_iters=BP_ITERS) -approx._build_problem() -approx._build_factors() -approx._build_mem_unary() -log( - f"[build] approx solver ready in {time.perf_counter() - t:.2f}s " - f"groups={len(approx.groups)} " - f"params={len(approx._memory['param_idxs']) if approx._memory else 0}" -) -opt._set_objective() - - -def lp_budget(): - """Read back the exact [low, high] the LP applied, so approx uses the same.""" - ch = opt.prob.constraints["memory_constraint_high"] - cl = opt.prob.constraints["memory_constraint_low"] - return -cl.constant, -ch.constant - - -log("\n" + "=" * 110) -log( - f"{'high_f':>8} | {'budget':>16} | {'LP obj':>12} {'frac':>7} {'LP s':>6} | " - f"{'approx obj':>12} {'mem':>7} {'lam':>9} {'feas':>5} {'s':>5} | " - f"{'gap/LP':>7} {'ILP obj':>12} {'gap/ILP':>8}" -) -log("-" * 110) - -rows = [] -for hf in HIGH_FACTORS: - opt._memory_constraint = (0.0, hf) - t = time.perf_counter() - lp = opt.solve_lp_relaxation(verbose=False, extract=False) - lp_s = time.perf_counter() - t - lp_obj = lp["objective"] - frac = f"{lp['n_fractional']}/{lp['n_vars']}" - blow, bhigh = lp_budget() - - approx._memory["budget_low"] = blow - approx._memory["budget_high"] = bhigh - approx._memory["tight"] = abs(bhigh - blow) < 1e-9 - t = time.perf_counter() - res = approx.solve_lagrangian(blow, bhigh, max_iter=24) - ap_s = time.perf_counter() - t - ap_obj = res["objective"] - gap = (ap_obj - lp_obj) / lp_obj * 100 if 
lp_obj else float("nan") - - ilp_obj, gap_ilp = None, None - if RUN_ILP and lp["n_fractional"] > 0: - opt._set_objective() - opt._apply_memory_constraint() - opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, timeLimit=ILP_TIMEOUT)) - ilp_obj = pulp.value(opt.prob.objective) - gap_ilp = (ap_obj - ilp_obj) / ilp_obj * 100 if ilp_obj else float("nan") - - rows.append((hf, lp_obj, ap_obj, gap, res["feasible"], ilp_obj, gap_ilp)) - log( - f"{hf:>8.4g} | [{blow:>6.3f},{bhigh:>7.3f}] | {lp_obj:>12.1f} {frac:>7} " - f"{lp_s:>5.1f}s | {ap_obj:>12.1f} {res['memory']:>7.3f} {res['lam']:>9.4g} " - f"{str(res['feasible']):>5} {ap_s:>4.1f}s | {gap:>+6.2f}% " - f"{('%.1f' % ilp_obj) if ilp_obj else '-':>12} " - f"{('%+.2f%%' % gap_ilp) if gap_ilp is not None else '-':>8}" - ) - -log("=" * 110) -gaps = [r[3] for r in rows if r[1]] -feas = [r[4] for r in rows] -if gaps: - log( - f"gap vs LP: mean={sum(gaps)/len(gaps):+.2f}% max={max(gaps):+.2f}% " - f"min={min(gaps):+.2f}% feasible={sum(feas)}/{len(feas)}" - ) -gi = [r[6] for r in rows if r[6] is not None] -if gi: - log( - f"gap vs ILP (fractional-LP budgets): mean={sum(gi)/len(gi):+.2f}% " - f"max={max(gi):+.2f}%" - ) diff --git a/examples/_bench_merge.py b/examples/_bench_merge.py deleted file mode 100644 index c6249021..00000000 --- a/examples/_bench_merge.py +++ /dev/null @@ -1,293 +0,0 @@ -"""Joint-optimization benchmark: prune (+ annotated) + dp (approx) vs each alone. 
- -Measures, for LLaMA3-1B on a 2D or 3D mesh with the canonical example_llama3 -constraints, four optimization configurations on the SAME traced model: - - prune : full ILP build + exact CBC solve (== prune_search_space) - annotated : full ILP build + propagate(fix) + CBC solve (== annotated_search) - dp : lite build + approx solve (== dp_solver) - merged : lite build + propagate(fix) + approx (this branch) - -Reports each config's build/solve/total time and objective, the LP-relaxation -lower bound (an optimality certificate), and checks the acceptance criteria: - - * merged objective within 10% (ideally 5%) of the ILP optimum, and - * merged total time < every individual optimization's total time. - -Env knobs: MESH ("8,8" 2D / "2,4,8" 3D), ILP_TIMEOUT (s, 0=unlimited), -N_LAYERS (0=default 16), SEQLEN. -""" -import logging -import os -import time -from unittest.mock import patch - -import pulp -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) - - -def log(msg=""): - print(msg, flush=True) - - -# Fake an 8-GPU H100 node so the cost model runs without real GPUs. 
-_PATCHES = [ - patch("torch.cuda.device_count", lambda: 8), - patch("torch.cuda.get_device_name", lambda *a, **k: "H100"), - patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)), - patch( - "torch.cuda.get_device_properties", - lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132} - )(), - ), -] -for p in _PATCHES: - p.start() - -N_LAYERS = int(os.environ.get("N_LAYERS", "0")) -SEQLEN = int(os.environ.get("SEQLEN", str(2048))) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(",")) -ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "0")) - -world_size = 1 -for d in MESH_SHAPE: - world_size *= d -_NAMES = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")} -mesh_names = _NAMES[len(MESH_SHAPE)] -fake_store = FakeStore() -torch.distributed.init_process_group( - "fake", store=fake_store, rank=0, world_size=world_size -) -mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", MESH_SHAPE, mesh_dim_names=mesh_names -) -ndim = mesh.ndim - -# MODEL=1b is the real LLaMA3-1B; MODEL=small is a tractable proxy whose smaller -# tensors yield few enough decision variables that the exact ILP/LP-bound finish -# on a 3D mesh (where the 1B PuLP problem has ~8M variables and is impractical), -# letting us certify the approximate solver's gap on real 3D structure. 
-MODEL = os.environ.get("MODEL", "1b") -vocab_size = 1024 if MODEL == "small" else 128256 -batch_size = 2 * mesh.shape[0] -seqlen = SEQLEN - - -def model_fn(): - if MODEL == "small": - args = TransformerModelArgs( - dim=256, n_layers=4, n_heads=8, n_kv_heads=4, - multiple_of=64, rope_theta=500000, - vocab_size=vocab_size, max_seq_len=seqlen, - ) - else: - args = TransformerModelArgs( - dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, - ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000, - vocab_size=vocab_size, max_seq_len=seqlen, - ) - if N_LAYERS: - args.n_layers = N_LAYERS - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda") - - -# Canonical TP plan: column-parallel q/k/v/w1/w3, row-parallel wo/w2, pinning -# only the tensor-parallel (last) mesh axis; data/cp axes left to the optimizer. -COLUMN_PARALLEL = (None,) * (ndim - 1) + (Shard(0),) -ROW_PARALLEL = (None,) * (ndim - 1) + (Shard(1),) - - -def annotate_tp_plan(autop): - for proj in ["wq", "wk", "wv"]: - autop.annotate_parameter(f"layers.*.attention.{proj}.weight", COLUMN_PARALLEL) - autop.annotate_parameter("layers.*.attention.wo.weight", ROW_PARALLEL) - for proj in ["w1", "w3"]: - autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", COLUMN_PARALLEL) - autop.annotate_parameter("layers.*.feed_forward.w2.weight", ROW_PARALLEL) - - -def add_constraints(autop): - x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1) - out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding - autop.add_parameter_memory_constraint(low=None, high=None) - autop.add_input_constraints([x_sharding]) - autop.add_output_constraints([out_sharding]) - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) - -log(f"=== LLaMA3-{MODEL} mesh={MESH_SHAPE}{mesh_names} world={world_size} " - f"seqlen={seqlen} vocab={vocab_size} 
layers={N_LAYERS or '(default)'} ===") -results = {} # name -> dict(build, solve, total, obj) - - -def build(build_pulp): - t = time.perf_counter() - autop = AutoParallel( - model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, - solver="ilp" if build_pulp else "approx", - ) - autop.__enter__() - add_constraints(autop) - return autop, time.perf_counter() - t - - -# ---------- full PuLP build: prune (ILP) + annotated (ILP) + LP bound ---------- -autop_full, build_full = build(build_pulp=True) -opt = autop_full.sharding_optimizer -log(f"\n[full build] {build_full:.2f}s decision_vars={len(opt.decision_vars)} " - f"pulp_vars={len(opt.pulp_variables)} constraints={len(opt.prob.constraints)}") - -# prune: exact ILP solve. preprocess-off is part of the prune optimization, and -# _apply_memory_constraint installs the same budget the approx solver enforces, -# so every config solves the identical constrained problem. -opt._set_objective() -opt._apply_memory_constraint() -kw = {"msg": False, "options": ["preprocess off"]} -if ILP_TIMEOUT > 0: - kw["timeLimit"] = ILP_TIMEOUT -t = time.perf_counter() -opt.prob.solve(pulp.PULP_CBC_CMD(**kw)) -t_ilp = time.perf_counter() - t -obj_opt = pulp.value(opt.prob.objective) -ilp_status = pulp.LpStatus[opt.prob.status] -results["prune"] = dict(build=build_full, solve=t_ilp, total=build_full + t_ilp, - obj=obj_opt) -log(f"[prune ] ILP solve {t_ilp:8.2f}s obj={obj_opt:11.1f} status={ilp_status}") - -# LP-relaxation lower bound: certifies the optimality gap without a full ILP -# (this sharding LP is empirically integral, so the bound equals the optimum). -lb_res = opt.get_lower_bound(verbose=False) -lb = lb_res.objective -log(f"[LP-bound ] solve {lb_res.solve_s:8.2f}s lower_bound={lb:11.1f}") - -# annotated: propagate the TP plan, then exact ILP solve on the reduced problem. 
-annotate_tp_plan(autop_full) -t = time.perf_counter() -prop = autop_full.propagate_annotations(verbose=False, method="fix") -t_prop_full = time.perf_counter() - t -opt._apply_memory_constraint() -t = time.perf_counter() -opt.prob.solve(pulp.PULP_CBC_CMD(**kw)) -t_ilp_ann = time.perf_counter() - t -obj_ann = pulp.value(opt.prob.objective) -results["annotated"] = dict(build=build_full, solve=t_prop_full + t_ilp_ann, - total=build_full + t_prop_full + t_ilp_ann, obj=obj_ann) -log(f"[annotated] propagate {t_prop_full:.2f}s + ILP {t_ilp_ann:.2f}s " - f"obj={obj_ann:11.1f} (pinned {prop.nodes_determined} nodes, " - f"-{100*prop.reduction:.0f}% strategies)") - -# Tear down before the next build: AutoParallel installs a FakeTensorMode, and -# two entered instances can't coexist. -autop_full.__exit__(None, None, None) - -# ---------- lite build: dp=prune+approx + merged=prune+approx+annotated ------- -autop_lite, build_lite = build(build_pulp=False) -opt_l = autop_lite.sharding_optimizer -log(f"\n[lite build] {build_lite:.2f}s decision_vars={len(opt_l.decision_vars)} " - f"pulp_vars={len(opt_l.pulp_variables)} (no PuLP problem)") - -# dp: approximate solve, no annotations. -t = time.perf_counter() -ApproximateShardingSolver(opt_l).get_solution(verbose=False) -t_approx_dp = time.perf_counter() - t -obj_dp = opt_l.profile["approximate"]["objective"] -results["dp"] = dict(build=build_lite, solve=t_approx_dp, total=build_lite + t_approx_dp, - obj=obj_dp) -log(f"[dp ] approx solve {t_approx_dp:8.2f}s obj={obj_dp:11.1f}") - -# merged: propagate the TP plan, then approximate solve on the reduced problem. 
-annotate_tp_plan(autop_lite) -t = time.perf_counter() -prop_l = autop_lite.propagate_annotations(verbose=False, method="fix") -t_prop_lite = time.perf_counter() - t -t = time.perf_counter() -ApproximateShardingSolver(opt_l).get_solution(verbose=False) -t_approx_merged = time.perf_counter() - t -obj_merged = opt_l.profile["approximate"]["objective"] -results["merged"] = dict(build=build_lite, solve=t_prop_lite + t_approx_merged, - total=build_lite + t_prop_lite + t_approx_merged, obj=obj_merged) -log(f"[merged ] propagate {t_prop_lite:.2f}s + approx {t_approx_merged:.2f}s " - f"obj={obj_merged:11.1f} (pinned {prop_l.nodes_determined} nodes)") - -autop_lite.__exit__(None, None, None) - -# ---------- report ---------- -# Optimality reference: exact ILP optimum if CBC proved it, else the LP lower -# bound (this sharding LP is empirically integral, so lb == optimum). -optimal = obj_opt if ilp_status == "Optimal" else lb -opt_label = "ILP optimum" if ilp_status == "Optimal" else "LP lower bound" - -LABELS = { - "prune": "prune (ILP)", - "annotated": "annotated (ILP)", - "dp": "prune+dp (approx)", - "merged": "prune+dp+anno", -} -log("\n" + "=" * 78) -log(f"{'config':<20}{'build(s)':>10}{'solve(s)':>10}{'total(s)':>10}" - f"{'objective':>13}{'gap%':>9}") -log("-" * 78) -for name in ["prune", "annotated", "dp", "merged"]: - r = results[name] - gap = 100 * (r["obj"] - optimal) / optimal - log(f"{LABELS[name]:<20}{r['build']:>10.2f}{r['solve']:>10.2f}{r['total']:>10.2f}" - f"{r['obj']:>13.1f}{gap:>+9.2f}") -log("=" * 78) -log(f"optimality reference: {opt_label} = {optimal:.1f} (ILP status={ilp_status})") - -# Core joint optimization is prune + dp (the approximate solver on the pruned -# space); annotation is the optional extra speedup. Report both gaps. 
-gap_core = 100 * (obj_dp - optimal) / optimal -gap_full = 100 * (obj_merged - optimal) / optimal -log(f"\nobjective gap vs {opt_label}:") -log(f" prune+dp (approx) : {gap_core:+.2f}% (core: prune + dp)") -log(f" prune+dp+annotated : {gap_full:+.2f}% (+ optional annotation)") - -# Timing: the joint solver must beat each ILP-based individual optimization. -# (dp alone == approx WITHOUT prune is measured against the dp_solver checkout -# separately; prune makes the joint build/solve strictly cheaper than that.) -log("\njoint total time (build+solve) vs each individual optimization:") -all_faster = True -for joint in ["dp", "merged"]: - tj = results[joint]["total"] - line_ok = True - for name in ["prune", "annotated"]: - to = results[name]["total"] - faster = tj < to - line_ok = line_ok and faster - log(f" {LABELS[joint]:<18} {tj:7.2f}s {'<' if faster else '>='} " - f"{LABELS[name]:<16} {to:7.2f}s {to / tj:5.1f}x " - f"{'OK' if faster else 'FAIL'}") - all_faster = all_faster and line_ok - -log("\n" + "=" * 78) -# The full three-way joint (prune + dp + annotated) is the deliverable: the -# approx solver alone is ~20% off, but the propagated TP plan steers it to the -# optimum. Annotation is therefore what meets the accuracy bar; prune+dp alone -# trades accuracy for a little more speed. -ok_gap = abs(gap_full) <= 10.0 -log(f"ACCEPTANCE gap<=10% (full joint prune+dp+anno): {ok_gap} " - f"(full={gap_full:+.2f}%, <=5%: {abs(gap_full) <= 5.0})") -log(f" (informational: prune+dp without annotation = {gap_core:+.2f}%)") -log(f"ACCEPTANCE joint faster than ILP-based optimizations: {all_faster}") -log(f"OVERALL: {'PASS' if ok_gap and all_faster else 'CHECK'}") diff --git a/examples/_bench_sizes.py b/examples/_bench_sizes.py deleted file mode 100644 index 46962209..00000000 --- a/examples/_bench_sizes.py +++ /dev/null @@ -1,166 +0,0 @@ -"""e2e prune+dp (approx) search across LLaMA3 sizes: latency + accuracy. 
- -For one MODEL on one MESH: - * latency: lite build (build_pulp=False) + ApproximateShardingSolver -> the - production prune+dp path (build_s, approx_s, total, objective). - * accuracy: a separate full PuLP build -> HiGHS LP-relaxation lower bound - (this sharding LP is integral, so the bound equals the exact ILP optimum); - gap = (approx_obj - lb) / lb. - -Env: MODEL (1b|3b|8b|70b), MESH (e.g. 2,4,8), SEQLEN. One model per process. -""" -import gc -import logging -import os -import time -from unittest.mock import patch - -import numpy as np -import pulp -import scipy.sparse as sp -import torch -from scipy.optimize import linprog -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), - ("get_device_capability", lambda *a, **k: (9, 0))]: - patch(f"torch.cuda.{fn}", val).start() -patch("torch.cuda.get_device_properties", lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() - -MODEL = os.environ.get("MODEL", "1b") -SEQLEN = int(os.environ.get("SEQLEN", "2048")) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(",")) -ws = 1 -for d in MESH_SHAPE: - ws *= d -names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] -torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) -mesh = 
torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) -ndim = mesh.ndim -vocab_size = 128256 -batch_size = 2 * mesh.shape[0] - -_CFG = { - "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), - "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), - "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), - "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), -} - - -def model_fn(): - args = TransformerModelArgs( - rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL] - ) - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") - - -def constrain(autop): - x = (Shard(0),) + (Replicate(),) * (ndim - 1) - out = (Shard(0), Shard(2)) if ndim == 2 else x - autop.add_parameter_memory_constraint(low=None, high=None) - autop.add_input_constraints([x]) - autop.add_output_constraints([out]) - - -def lp_lower_bound_highs(opt): - """LP relaxation (binaries -> [0,1]) of the built problem, solved with HiGHS. 
- Objective is read from decision_vars and constraints from prob.constraints - using id()-keyed indexing (avoids hashing the long PuLP var names).""" - opt._set_objective() - opt._apply_memory_constraint() - variables = opt.prob.variables() - vidx = {id(v): i for i, v in enumerate(variables)} - n = len(variables) - c = np.zeros(n) - for key, dv in opt.decision_vars.items(): - mult = 1 + len(opt._root_to_copies.get(key[0], ())) - c[vidx[id(dv.var)]] += dv.cost * mult - re = ru = 0 - reqr, reqc, reqd, beq = [], [], [], [] - rubr, rubc, rubd, bub = [], [], [], [] - for con in opt.prob.constraints.values(): - rhs = -con.constant - if con.sense == pulp.LpConstraintEQ: - for v, co in con.items(): - reqr.append(re); reqc.append(vidx[id(v)]); reqd.append(co) - beq.append(rhs); re += 1 - else: - s = 1.0 if con.sense == pulp.LpConstraintLE else -1.0 - for v, co in con.items(): - rubr.append(ru); rubc.append(vidx[id(v)]); rubd.append(s * co) - bub.append(s * rhs); ru += 1 - A_eq = sp.csr_matrix((reqd, (reqr, reqc)), shape=(re, n)) if re else None - A_ub = sp.csr_matrix((rubd, (rubr, rubc)), shape=(ru, n)) if ru else None - # Dual simplex: far faster than the barrier (IPM) on this near-integral, - # network-flow-like LP. We only need the optimal objective as the bound. 
- method = os.environ.get("LP_METHOD", "highs-ds") - res = linprog(c, A_ub=A_ub, b_ub=(bub or None), A_eq=A_eq, b_eq=(beq or None), - bounds=(0, 1), method=method, options={"disp": True}) - if not res.success: - raise RuntimeError(f"HiGHS failed: {res.message}") - return res.fun, n, re + ru - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -print(f"### MODEL={MODEL} mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ###", flush=True) - -# ---- latency: lite build + prune+dp approx (production path) ---- -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx") -autop.__enter__() -constrain(autop) -build_lite = time.perf_counter() - t -opt = autop.sharding_optimizer -n_dv = len(opt.decision_vars) -params = opt.profile["model"]["parameter_numel"] -t = time.perf_counter() -ApproximateShardingSolver(opt).get_solution(verbose=False) -approx_s = time.perf_counter() - t -obj = opt.profile["approximate"]["objective"] -print(f"[latency] params={params/1e9:.2f}B lite_build={build_lite:.1f}s " - f"approx={approx_s:.1f}s total={build_lite + approx_s:.1f}s " - f"decision_vars={n_dv} obj={obj:.1f}", flush=True) -autop.__exit__(None, None, None) -del autop, opt -gc.collect() - -if os.environ.get("ACCURACY", "1") != "1": - print(f"[RESULT] MODEL={MODEL} params={params/1e9:.2f}B " - f"prune+dp: build={build_lite:.1f}s approx={approx_s:.1f}s " - f"total={build_lite+approx_s:.1f}s obj={obj:.1f} (LP skipped)", flush=True) - raise SystemExit(0) - -# ---- accuracy: full build + HiGHS LP lower bound ---- -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp") -autop.__enter__() -constrain(autop) -full_build = time.perf_counter() - t -opt = autop.sharding_optimizer -t = time.perf_counter() -lb, nvar, ncon = lp_lower_bound_highs(opt) -lp_s = time.perf_counter() - t -gap = 100 * (obj - lb) 
/ lb -print(f"[accuracy] full_build={full_build:.1f}s lp_solve={lp_s:.1f}s " - f"lower_bound={lb:.1f} vars={nvar} cons={ncon}", flush=True) -print(f"[RESULT] MODEL={MODEL} params={params/1e9:.2f}B " - f"prune+dp: build={build_lite:.1f}s approx={approx_s:.1f}s total={build_lite+approx_s:.1f}s " - f"obj={obj:.1f} LP_lb={lb:.1f} gap={gap:+.2f}%", flush=True) diff --git a/examples/_bench_trws.py b/examples/_bench_trws.py deleted file mode 100644 index 4e4fbc2d..00000000 --- a/examples/_bench_trws.py +++ /dev/null @@ -1,173 +0,0 @@ -"""Prototype TRW-S (tree-reweighted sequential message passing) on the approx -solver's faithful factor graph, validated against the CBC-exact optimum. If TRW-S -(optionally + the existing local search) reaches the optimum where plain min-sum -BP does not, it is the fix. Env: MODEL, MESH, SEQLEN, ITERS.""" -import logging -import os -import time -from unittest.mock import patch - -import numpy as np -import pulp -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel -from autoparallel.approximate_sharding import ApproximateShardingSolver -from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config -from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config - -logging.basicConfig(level=logging.ERROR) -for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"), - ("get_device_capability", lambda *a, **k: (9, 0))]: - patch(f"torch.cuda.{fn}", val).start() -patch("torch.cuda.get_device_properties", lambda *a, **k: type( - "P", (), {"major": 9, "minor": 0, "name": "H100", - "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start() - -MODEL = os.environ.get("MODEL", "1b") -SEQLEN = 
int(os.environ.get("SEQLEN", "2048")) -MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(",")) -ITERS = int(os.environ.get("ITERS", "1000")) -USE_CBC = os.environ.get("CBC", "1") == "1" -ws = 1 -for d in MESH_SHAPE: - ws *= d -names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)] -torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws) -mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names) -ndim = mesh.ndim -vocab_size = 128256 -batch_size = 2 * mesh.shape[0] -_CFG = { - "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), - "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256), - "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), - "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096), -} - - -def model_fn(): - args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size, - max_seq_len=SEQLEN, **_CFG[MODEL]) - with torch.device("meta"): - return Transformer(args) - - -def input_fn(): - return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda") - - -def constrain(autop): - x = (Shard(0),) + (Replicate(),) * (ndim - 1) - out = (Shard(0), Shard(2)) if ndim == 2 else x - autop.add_parameter_memory_constraint(low=None, high=None) - autop.add_input_constraints([x]) - autop.add_output_constraints([out]) - - -set_nccl_topo_config(detect_nccl_topo_config(mesh)) -mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) -print(f"### TRW-S MODEL={MODEL} mesh={MESH_SHAPE}{names} iters={ITERS} ###", flush=True) - -backend = "ilp" if USE_CBC else "approx" -t = time.perf_counter() -autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver=backend) -autop.__enter__() -constrain(autop) -opt = 
autop.sharding_optimizer -print(f"[build] {time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True) - -obj_cbc = None -if USE_CBC: - opt._set_objective() - opt._apply_memory_constraint() - t = time.perf_counter() - opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, options=["preprocess off"])) - obj_cbc = pulp.value(opt.prob.objective) - print(f"[cbc] obj={obj_cbc:.1f} status={pulp.LpStatus[opt.prob.status]} " - f"({time.perf_counter()-t:.1f}s)", flush=True) - - -_REF = obj_cbc if obj_cbc else float(os.environ.get("LP_LB", "0")) or None - - -def gap(o): - return 100 * (o - _REF) / _REF if _REF else float("nan") - - -# Stock approx (BP + local search) for comparison. -a0 = ApproximateShardingSolver(opt) -t = time.perf_counter() -a0.get_solution(verbose=False) -print(f"[stock approx] obj={opt.profile['approximate']['objective']:.1f} " - f"gap={gap(opt.profile['approximate']['objective']):+.2f}% ({time.perf_counter()-t:.1f}s)", flush=True) - -# Build a fresh factor graph for TRW-S. 
-A = ApproximateShardingSolver(opt) -A._build_problem() -A._build_factors() -G = len(A.groups) -nbrs = A.nbrs -unary = A.g_unary -order = sorted(range(G), key=lambda g: min(A.groups[g].members)) -pos = [0] * G -for i, g in enumerate(order): - pos[g] = i -gamma = [] -for g in range(G): - indeg = sum(1 for h in nbrs[g] if pos[h] < pos[g]) - outdeg = sum(1 for h in nbrs[g] if pos[h] > pos[g]) - gamma.append(1.0 / max(1, max(indeg, outdeg))) - -msg = {} -for g in range(G): - for h in nbrs[g]: - msg[(g, h)] = np.zeros(len(unary[h])) - -t = time.perf_counter() -best = float("inf") -best_snap = None -for it in range(ITERS): - for forward in (True, False): - seq = order if forward else order[::-1] - for p in seq: - if not nbrs[p]: - continue - agg = unary[p].copy() - for r in nbrs[p]: - agg += msg[(r, p)] - wp = gamma[p] * agg - for q in nbrs[p]: - if (pos[q] > pos[p]) != forward: - continue - P = A._pair_matrix(p, q) # (D_p, D_q) - mm = (wp - msg[(q, p)])[:, None] + P - mq = mm.min(axis=0) - mq -= mq.min() - msg[(p, q)] = mq - A._decode(msg) - e = A._fast_total_energy() - if e < best - 1e-6: - best = e - best_snap = [g.current for g in A.groups] - if it < 5 or it % 50 == 0: - print(f" [trws it={it}] decode_energy={e:.1f} best={best:.1f} gap={gap(best):+.2f}%", flush=True) -trws_s = time.perf_counter() - t -for gid, ci in enumerate(best_snap): - A._set_group(gid, ci) -print(f"[TRW-S] best={best:.1f} gap={gap(best):+.2f}% ({trws_s:.1f}s, {ITERS} iters)", flush=True) - -# Polish TRW-S result with the existing local search. 
-deadline = time.perf_counter() + 60 -A._memory_repair() -A._coordinate_descent(deadline) -A._star_block_search(deadline) -polished = A._fast_total_energy() -print(f"[TRW-S + local search] obj={polished:.1f} gap={gap(polished):+.2f}%", flush=True) -print(f"[RESULT] MODEL={MODEL} mesh={MESH_SHAPE} cbc={obj_cbc} " - f"stock_gap={gap(opt.profile['approximate']['objective']):+.2f}% " - f"trws_gap={gap(best):+.2f}% trws_ls_gap={gap(polished):+.2f}%", flush=True) diff --git a/examples/_sanity_llama3.py b/examples/_sanity_llama3.py deleted file mode 100644 index 6a44b386..00000000 --- a/examples/_sanity_llama3.py +++ /dev/null @@ -1,223 +0,0 @@ -"""Real LLaMA3 AutoParallel training sanity check on a 2D or 3D mesh. - -Traces the model, picks a sharding strategy with the approximate (TRW-S) solver, -applies it as DTensor, and trains a fixed random batch for a few steps on real -GPUs. Pass: the loss curve goes down. Adapted from example_sanity_check_qwen3.py. - -The batch is data-parallel on the `dp` axis only; any other axes (`cp`, `tp`) -are model-sharding axes (the solver shards params/activations over them). Logits -are vocab-parallel on `tp` and replicated on `cp`, so the loss is reduced over -the world and normalized by global_token_count * (world_size // dp_degree). 
- -Run: torchrun --standalone --nproc-per-node N examples/_sanity_llama3.py --mesh 2,2,8 --model 8b -""" -import argparse -import logging -import os -import time - -import torch -import torch.distributed as dist -import torch.distributed.nn.functional as dist_nn_func -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard - -from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs -from autoparallel.api import AutoParallel - -_CFG = { - "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256), - "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024), -} -_NAMES = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")} - - -def parse_args(): - p = argparse.ArgumentParser(description="LLaMA3 AutoParallel training sanity check.") - p.add_argument("--model", type=str, default="1b", choices=list(_CFG)) - p.add_argument("--mesh", type=str, default="2,2", help="comma-separated mesh dims") - p.add_argument("--global-batch-size", type=int, default=8) - p.add_argument("--microbatch-size", type=int, default=2) - p.add_argument("--seq-len", type=int, default=512) - p.add_argument("--train-steps", type=int, default=10) - p.add_argument("--lr", type=float, default=1e-3) - p.add_argument("--max-grad-norm", type=float, default=1.0) - p.add_argument("--seed", type=int, default=0) - p.add_argument("--solver", type=str, default="approx") - p.add_argument("--verbose", action="store_true") - return p.parse_args() - - -def init_distributed(args): - if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ: - raise RuntimeError("Run with torchrun --standalone --nproc-per-node N ...") - world_size = int(os.environ["WORLD_SIZE"]) - local_rank = int(os.environ["LOCAL_RANK"]) - dims = tuple(int(x) for x in args.mesh.split(",")) - prod = 1 - for d in dims: - prod *= d - if prod != world_size: - raise 
ValueError(f"WORLD_SIZE {world_size} != prod(mesh) {prod}") - device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) - dist.init_process_group("nccl", device_id=device) - mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", dims, mesh_dim_names=_NAMES[len(dims)] - ) - return device, mesh - - -def placement_for(name, *, is_output): - if name == "dp": - return Shard(0) - if name == "tp" and is_output: - return Shard(2) - return Replicate() - - -def make_local_tokens(args, mesh, device, vocab_size): - names = mesh.mesh_dim_names - dp_rank = mesh.get_coordinate()[names.index("dp")] - dp_degree = mesh["dp"].size() - local_batch_size = args.global_batch_size // dp_degree - gen = torch.Generator(device="cpu") - gen.manual_seed(args.seed) - tokens = torch.randint( - 0, vocab_size, (args.global_batch_size, args.seq_len + 1), - generator=gen, dtype=torch.long, - ) - start = dp_rank * local_batch_size - return tokens[start:start + local_batch_size].to(device, non_blocking=True) - - -def vocab_parallel_cross_entropy(logits, labels, *, vocab_size, tp_group, tp_rank, - tp_degree, normalizer): - local_vocab_size = logits.shape[-1] - vocab_start = tp_rank * local_vocab_size - vocab_stop = vocab_size if tp_rank == tp_degree - 1 else vocab_start + local_vocab_size - logits = logits.float() - local_max = logits.amax(dim=-1) - with torch.no_grad(): - global_max = local_max.detach().clone() - dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=tp_group) - shifted = logits - global_max.unsqueeze(-1) - global_exp_sum = dist_nn_func.all_reduce( - shifted.exp().sum(dim=-1), op=dist.ReduceOp.SUM, group=tp_group) - mask = (labels >= vocab_start) & (labels < vocab_stop) - local_target = torch.zeros_like(labels, dtype=torch.long) - local_target[mask] = labels[mask] - vocab_start - local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1) - local_target_logits = local_target_logits * mask.to(logits.dtype) - target_logits = 
dist_nn_func.all_reduce( - local_target_logits, op=dist.ReduceOp.SUM, group=tp_group) - loss_sum = (global_exp_sum.log() + global_max - target_logits).sum() - return loss_sum / normalizer - - -def print_rank0(msg): - if dist.get_rank() == 0: - print(msg, flush=True) - - -def main(): - args = parse_args() - logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING) - device, mesh = init_distributed(args) - names = mesh.mesh_dim_names - world_size = dist.get_world_size() - tp_group = mesh.get_group("tp") - tp_rank = mesh.get_local_rank("tp") - tp_degree = mesh["tp"].size() - dp_degree = mesh["dp"].size() - local_batch_size = args.global_batch_size // dp_degree - grad_accum = local_batch_size // args.microbatch_size - # logits are distinct only across dp (cp/tp replicate the per-token loss), - # so the world all-reduce over-counts by world_size // dp_degree. - normalizer = args.global_batch_size * args.seq_len * (world_size // dp_degree) - - torch.manual_seed(args.seed) - model_args = TransformerModelArgs( - rope_theta=500000, vocab_size=128256, max_seq_len=args.seq_len, **_CFG[args.model], - ) - trace_global_batch = args.microbatch_size * dp_degree - - with torch.device("meta"): - model = Transformer(model_args) - - def input_fn(): - return torch.randint(0, model_args.vocab_size, - (trace_global_batch, args.seq_len), device=device) - - mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) - x_sharding = tuple(placement_for(n, is_output=False) for n in names) - out_sharding = tuple(placement_for(n, is_output=True) for n in names) - print_rank0(f"LLaMA3-{args.model} sanity: mesh={tuple(mesh.shape)}{names} " - f"solver={args.solver} in={x_sharding} out={out_sharding} " - f"global_batch={args.global_batch_size} microbatch={args.microbatch_size} " - f"grad_accum={grad_accum} seq_len={args.seq_len} steps={args.train_steps} lr={args.lr}") - - t0 = time.time() - with AutoParallel(model, input_fn, mesh, mp_policy, 
repeated_subgraphs=True, - solver=args.solver) as autop: - autop.add_parameter_memory_constraint(low=None, high=None) - autop.add_input_constraints([x_sharding]) - autop.add_output_constraints([out_sharding]) - sharding_placement = autop.optimize_placement(verbose=args.verbose) - parallel_mod = autop.apply_placement(sharding_placement) - print_rank0(f"trace+optimize+apply took {time.time() - t0:.1f}s") - - parallel_mod.to_empty(device=device) - parallel_mod.init_weights(buffer_device=device) - - batch = make_local_tokens(args, mesh, device, model_args.vocab_size) - inputs = batch[:, :-1].contiguous() - labels = batch[:, 1:].contiguous() - input_mbs = inputs.split(args.microbatch_size, dim=0) - label_mbs = labels.split(args.microbatch_size, dim=0) - optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr) - - try: - losses = [] - step_times = [] - for step in range(args.train_steps): - torch.cuda.synchronize(device) - t_step = time.perf_counter() - optimizer.zero_grad(set_to_none=True) - step_loss = torch.zeros((), device=device) - for mi, ml in zip(input_mbs, label_mbs): - logits = parallel_mod(mi) - loss = vocab_parallel_cross_entropy( - logits, ml, vocab_size=model_args.vocab_size, tp_group=tp_group, - tp_rank=tp_rank, tp_degree=tp_degree, normalizer=normalizer) - loss.backward() - step_loss = step_loss + loss.detach() - torch.nn.utils.clip_grad_norm_(parallel_mod.parameters(), args.max_grad_norm) - optimizer.step() - torch.cuda.synchronize(device) - step_times.append(time.perf_counter() - t_step) - with torch.no_grad(): - logged = step_loss.clone() - dist.all_reduce(logged, op=dist.ReduceOp.SUM) - losses.append(float(logged.item())) - print_rank0(f"step={step:03d} loss={losses[-1]:.6f} step_time={1000*step_times[-1]:.0f}ms") - - warmup = min(3, max(0, len(step_times) - 2)) - steady = sorted(step_times[warmup:]) - if steady: - mean_ms = 1000 * sum(steady) / len(steady) - print_rank0(f"[latency] solver={args.solver} per-step (excl {warmup} warmup, " - 
f"{len(steady)} steps): mean={mean_ms:.0f}ms " - f"median={1000*steady[len(steady)//2]:.0f}ms min={1000*steady[0]:.0f}ms") - print_rank0(f"\nloss curve: {[round(x, 4) for x in losses]}") - verdict = "PASS" if losses[-1] < losses[0] else "FAIL" - print_rank0(f"SANITY {verdict}: loss {losses[0]:.4f} -> {losses[-1]:.4f}") - dist.barrier(device_ids=[device.index]) - torch.cuda.synchronize(device) - finally: - if dist.is_initialized(): - dist.destroy_process_group() - - -if __name__ == "__main__": - main() diff --git a/examples/example_llama3.py b/examples/example_llama3.py index 0879e568..00025f5c 100644 --- a/examples/example_llama3.py +++ b/examples/example_llama3.py @@ -28,9 +28,6 @@ ) from autoparallel.graph_passes.debug_helpers import make_custom_runtime_estimation from autoparallel.graph_passes.estimate_graph_metrics import estimate_graph_metrics -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Partial, Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore logging.basicConfig(level=logging.DEBUG) diff --git a/examples/example_qwen3.py b/examples/example_qwen3.py deleted file mode 100644 index 2ae57b00..00000000 --- a/examples/example_qwen3.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import logging -import time - -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard -from torch.testing._internal.distributed.fake_pg import FakeStore - -from autoparallel._testing.models.qwen3 import ( - Qwen3ModelArgs, - Transformer, - qwen3_235b_a22b_args, - qwen3_30b_a3b_args, - qwen3_8b_args, - qwen3_debug_args, - qwen3_moe_debug_args, -) -from autoparallel.api import AutoParallel -from autoparallel.compile import autoparallel_backend - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Trace, optimize, and smoke-test dense Qwen3 with AutoParallel." - ) - parser.add_argument( - "--flavor", - choices=("tiny", "moe-tiny", "debug", "8b", "moe-debug", "30b-a3b", "235b-a22b"), - default="tiny", - help="Qwen3 model size to instantiate. Defaults to tiny for faster runs.", - ) - parser.add_argument( - "--seq-len", - type=int, - default=None, - help="Sequence length. Defaults to 8 for tiny, 512 for debug, and 4096 for 8b.", - ) - parser.add_argument( - "--world-size", - type=int, - default=64, - help="Fake process-group world size.", - ) - parser.add_argument( - "--tp-degree", - type=int, - default=8, - help="Second mesh degree. 
Used as TP for dense flavors and EP for MoE flavors.", - ) - parser.add_argument( - "--local-batch-size", - type=int, - default=2, - help="Per-DP-rank batch size used for the runtime smoke pass.", - ) - parser.add_argument( - "--save-optimizer", - type=str, - default=None, - help="Optional path for the serialized sharding optimizer state.", - ) - parser.add_argument( - "--compile", - action="store_true", - help="Compile the placed module with the AutoParallel backend before running.", - ) - parser.add_argument( - "--skip-run", - action="store_true", - help="Only run tracing, optimization, and placement application.", - ) - parser.add_argument( - "--verbose", - action="store_true", - help="Print the full AutoParallel optimizer log.", - ) - return parser.parse_args() - - -def make_model_args(flavor: str, seq_len: int): - if flavor == "tiny": - return Qwen3ModelArgs( - dim=64, - n_layers=2, - n_heads=4, - n_kv_heads=2, - head_dim=16, - hidden_dim=128, - vocab_size=128, - max_seq_len=seq_len, - ) - if flavor == "moe-tiny": - return Qwen3ModelArgs( - dim=64, - n_layers=1, - n_heads=4, - n_kv_heads=2, - head_dim=16, - hidden_dim=128, - vocab_size=128, - max_seq_len=seq_len, - moe_enabled=True, - moe_hidden_dim=32, - num_experts=8, - top_k=2, - route_norm=True, - score_before_experts=False, - ) - if flavor == "debug": - return qwen3_debug_args(max_seq_len=seq_len) - if flavor == "8b": - return qwen3_8b_args(max_seq_len=seq_len) - if flavor == "moe-debug": - return qwen3_moe_debug_args(max_seq_len=seq_len) - if flavor == "30b-a3b": - return qwen3_30b_a3b_args(max_seq_len=seq_len) - if flavor == "235b-a22b": - return qwen3_235b_a22b_args(max_seq_len=seq_len) - raise ValueError(f"Unknown Qwen3 flavor: {flavor}") - - -def main(): - args = parse_args() - logging.basicConfig(level=logging.DEBUG) - - seq_len = args.seq_len - if seq_len is None: - seq_len = { - "tiny": 8, - "moe-tiny": 8, - "debug": 512, - "8b": 4096, - "moe-debug": 512, - "30b-a3b": 4096, - "235b-a22b": 4096, - 
}[args.flavor] - if args.world_size % args.tp_degree != 0: - raise ValueError( - f"world-size ({args.world_size}) must be divisible by " - f"tp-degree ({args.tp_degree})." - ) - - if not torch.distributed.is_initialized(): - fake_store = FakeStore() - torch.distributed.init_process_group( - "fake", - store=fake_store, - rank=0, - world_size=args.world_size, - ) - - model_args = make_model_args(args.flavor, seq_len) - mesh_dim_names = ("dp", "ep") if model_args.moe_enabled else ("dp", "tp") - mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", - (args.world_size // args.tp_degree, args.tp_degree), - mesh_dim_names=mesh_dim_names, - ) - device = torch.device("cuda") - - global_batch_size = args.local_batch_size * mesh.shape[0] - if model_args.moe_enabled: - global_batch_size *= mesh.shape[1] - - with torch.device("meta"): - model = Transformer( - model_args, - mesh=mesh if model_args.moe_enabled else None, - moe_axis_name=mesh.mesh_dim_names[1], - ) - - def input_fn(): - return torch.randint( - 0, - model_args.vocab_size, - (global_batch_size, seq_len), - device=device, - ) - - mp_policy = MixedPrecisionPolicy( - param_dtype=torch.bfloat16, - reduce_dtype=torch.float32, - ) - - t0 = time.time() - with AutoParallel( - model, - input_fn, - mesh, - mp_policy, - dynamic=model_args.moe_enabled, - repeated_subgraphs=True, - ) as autop: - autop.add_parameter_memory_constraint(low=None, high=None) - - x_sharding = (Shard(0), Shard(0)) if model_args.moe_enabled else (Shard(0), Replicate()) - out_sharding = (Shard(0), Shard(2)) - autop.add_input_constraints([x_sharding]) - autop.add_output_constraints([out_sharding]) - - sharding_placement = autop.optimize_placement(verbose=args.verbose) - print(f"Tracing + optimization took {time.time() - t0:.1f}s") - - if args.save_optimizer is not None: - autop.sharding_optimizer.save(args.save_optimizer) - autop.sharding_optimizer.save_placements( - f"{args.save_optimizer}.placements.json" - ) - - parallel_mod = 
autop.apply_placement(sharding_placement) - - if args.skip_run: - print("Placement applied successfully.") - return - - parallel_mod.to_empty(device=device) - parallel_mod.init_weights(buffer_device=device) # type: ignore[operator] - - if args.compile: - parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend()) - - tokens = torch.randint( - 0, - model_args.vocab_size, - (args.local_batch_size, seq_len), - device=device, - ) - out = parallel_mod(tokens) - if torch.any(torch.isnan(out)): - raise RuntimeError("Found NaNs in Qwen3 forward output.") - out.backward(torch.randn_like(out)) - print("All good!") - - -if __name__ == "__main__": - main() diff --git a/examples/example_sanity_check_qwen3.py b/examples/example_sanity_check_qwen3.py deleted file mode 100644 index b7af6c0d..00000000 --- a/examples/example_sanity_check_qwen3.py +++ /dev/null @@ -1,335 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import logging -import os -import time - -import torch -import torch.distributed as dist -import torch.distributed.nn.functional as dist_nn_func -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard - -from autoparallel._testing.models.qwen3 import Transformer, qwen3_8b_args -from autoparallel.api import AutoParallel -from autoparallel.compile import autoparallel_backend - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Run a real Qwen3 8B AutoParallel training sanity check." 
- ) - parser.add_argument( - "--global-batch-size", - type=int, - default=16, - help="Global batch size across data-parallel ranks.", - ) - parser.add_argument( - "--microbatch-size", - type=int, - default=1, - help="Per-DP-rank microbatch size for gradient accumulation.", - ) - parser.add_argument( - "--seq-len", - type=int, - default=4096, - help="Sequence length. Defaults to Qwen3 8B's max sequence length.", - ) - parser.add_argument( - "--dp-degree", - type=int, - default=2, - help="Data-parallel mesh degree.", - ) - parser.add_argument( - "--tp-degree", - type=int, - default=2, - help="Tensor-parallel mesh degree.", - ) - parser.add_argument( - "--train-steps", - type=int, - default=20, - help="Number of optimizer steps.", - ) - parser.add_argument( - "--lr", - type=float, - default=3e-4, - help="AdamW learning rate.", - ) - parser.add_argument( - "--max-grad-norm", - type=float, - default=1.0, - help="Gradient clipping max norm.", - ) - parser.add_argument( - "--seed", - type=int, - default=0, - help="Seed for model initialization and synthetic data generation.", - ) - parser.add_argument( - "--compile", - action="store_true", - help="Compile the placed module with the AutoParallel backend before training.", - ) - parser.add_argument( - "--verbose", - action="store_true", - help="Print the full AutoParallel optimizer log.", - ) - return parser.parse_args() - - -def init_distributed(args): - if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ: - raise RuntimeError( - "Run this example with torchrun, e.g. " - "torchrun --standalone --nproc-per-node 4 " - "examples/example_sanity_check_qwen3.py" - ) - - world_size = int(os.environ["WORLD_SIZE"]) - local_rank = int(os.environ["LOCAL_RANK"]) - expected_world_size = args.dp_degree * args.tp_degree - if world_size != expected_world_size: - raise ValueError( - f"WORLD_SIZE ({world_size}) must equal dp-degree * tp-degree " - f"({args.dp_degree} * {args.tp_degree} = {expected_world_size})." 
- ) - if args.global_batch_size % args.dp_degree != 0: - raise ValueError( - f"global-batch-size ({args.global_batch_size}) must be divisible by " - f"dp-degree ({args.dp_degree})." - ) - local_batch_size = args.global_batch_size // args.dp_degree - if local_batch_size % args.microbatch_size != 0: - raise ValueError( - f"local batch size ({local_batch_size}) must be divisible by " - f"microbatch-size ({args.microbatch_size})." - ) - - device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) - dist.init_process_group("nccl", device_id=device) - mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", - (args.dp_degree, args.tp_degree), - mesh_dim_names=("dp", "tp"), - ) - return device, mesh - - -def make_local_tokens(args, mesh, device, vocab_size: int) -> torch.Tensor: - coordinate = mesh.get_coordinate() - if coordinate is None: - raise RuntimeError("DeviceMesh coordinate is unavailable on this rank.") - dp_rank, _tp_rank = coordinate - local_batch_size = args.global_batch_size // args.dp_degree - - generator = torch.Generator(device="cpu") - generator.manual_seed(args.seed) - tokens = torch.randint( - 0, - vocab_size, - (args.global_batch_size, args.seq_len + 1), - generator=generator, - dtype=torch.long, - ) - - start = dp_rank * local_batch_size - stop = start + local_batch_size - return tokens[start:stop].to(device, non_blocking=True) - - -def vocab_parallel_cross_entropy( - logits: torch.Tensor, - labels: torch.Tensor, - *, - vocab_size: int, - tp_group, - tp_rank: int, - tp_degree: int, - global_token_count: int, -) -> torch.Tensor: - if logits.shape[:2] != labels.shape: - raise ValueError( - f"logits shape {tuple(logits.shape)} is incompatible with " - f"labels shape {tuple(labels.shape)}." 
- ) - - local_vocab_size = logits.shape[-1] - vocab_start = tp_rank * local_vocab_size - vocab_stop = vocab_start + local_vocab_size - if tp_rank == tp_degree - 1: - vocab_stop = vocab_size - - logits = logits.float() - local_max = logits.amax(dim=-1) - with torch.no_grad(): - global_max = local_max.detach().clone() - dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=tp_group) - - shifted_logits = logits - global_max.unsqueeze(-1) - local_exp_sum = shifted_logits.exp().sum(dim=-1) - global_exp_sum = dist_nn_func.all_reduce( - local_exp_sum, - op=dist.ReduceOp.SUM, - group=tp_group, - ) - - target_mask = (labels >= vocab_start) & (labels < vocab_stop) - local_target = torch.zeros_like(labels, dtype=torch.long) - local_target[target_mask] = labels[target_mask] - vocab_start - local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1) - local_target_logits = local_target_logits * target_mask.to(logits.dtype) - target_logits = dist_nn_func.all_reduce( - local_target_logits, - op=dist.ReduceOp.SUM, - group=tp_group, - ) - - loss_sum = (global_exp_sum.log() + global_max - target_logits).sum() - return loss_sum / (global_token_count * tp_degree) - - -def print_rank0(message: str) -> None: - if dist.get_rank() == 0: - print(message, flush=True) - - -def main(): - args = parse_args() - logging.basicConfig(level=logging.DEBUG) - - device, mesh = init_distributed(args) - tp_group = mesh.get_group("tp") - tp_rank = mesh.get_local_rank("tp") - local_batch_size = args.global_batch_size // args.dp_degree - gradient_accumulation_steps = local_batch_size // args.microbatch_size - - torch.manual_seed(args.seed) - model_args = qwen3_8b_args(max_seq_len=args.seq_len) - trace_global_batch_size = args.microbatch_size * args.dp_degree - - with torch.device("meta"): - model = Transformer(model_args) - - def input_fn(): - return torch.randint( - 0, - model_args.vocab_size, - (trace_global_batch_size, args.seq_len), - device=device, - ) - - mp_policy = 
MixedPrecisionPolicy( - param_dtype=torch.bfloat16, - reduce_dtype=torch.float32, - ) - - print_rank0( - "Qwen3 8B sanity check: " - f"mesh=(dp={args.dp_degree}, tp={args.tp_degree}), " - f"global_batch={args.global_batch_size}, " - f"local_batch={local_batch_size}, " - f"microbatch={args.microbatch_size}, " - f"grad_accum={gradient_accumulation_steps}, " - f"trace_global_batch={trace_global_batch_size}, " - f"seq_len={args.seq_len}" - ) - - t0 = time.time() - with AutoParallel( - model, - input_fn, - mesh, - mp_policy, - repeated_subgraphs=True, - ) as autop: - autop.add_parameter_memory_constraint(low=None, high=None) - autop.add_input_constraints([(Shard(0), Replicate())]) - autop.add_output_constraints([(Shard(0), Shard(2))]) - sharding_placement = autop.optimize_placement(verbose=args.verbose) - parallel_mod = autop.apply_placement(sharding_placement) - - print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s") - - parallel_mod.to_empty(device=device) - parallel_mod.init_weights(buffer_device=device, seed=args.seed) # type: ignore[operator] - - if args.compile: - parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend()) - - batch = make_local_tokens(args, mesh, device, model_args.vocab_size) - inputs = batch[:, :-1].contiguous() - labels = batch[:, 1:].contiguous() - input_microbatches = inputs.split(args.microbatch_size, dim=0) - label_microbatches = labels.split(args.microbatch_size, dim=0) - global_token_count = args.global_batch_size * args.seq_len - optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr) - - try: - losses: list[float] = [] - for step in range(args.train_steps): - optimizer.zero_grad(set_to_none=True) - step_loss = torch.zeros((), device=device) - for micro_inputs, micro_labels in zip( - input_microbatches, label_microbatches - ): - logits = parallel_mod(micro_inputs) - if torch.any(torch.isnan(logits)): - raise RuntimeError("Found NaNs in Qwen3 forward output.") - - loss = 
vocab_parallel_cross_entropy( - logits, - micro_labels, - vocab_size=model_args.vocab_size, - tp_group=tp_group, - tp_rank=tp_rank, - tp_degree=args.tp_degree, - global_token_count=global_token_count, - ) - if torch.any(torch.isnan(loss)): - raise RuntimeError("Found NaNs in Qwen3 training loss.") - - loss.backward() - step_loss = step_loss + loss.detach() - - torch.nn.utils.clip_grad_norm_( - parallel_mod.parameters(), args.max_grad_norm - ) - optimizer.step() - - with torch.no_grad(): - logged_loss = step_loss.clone() - dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM) - loss_value = float(logged_loss.item()) - losses.append(loss_value) - print_rank0(f"step={step:03d} loss={loss_value:.6f}") - - if losses[-1] >= losses[0]: - raise RuntimeError( - f"Qwen3 training loss did not improve: initial={losses[0]:.6f}, " - f"final={losses[-1]:.6f}" - ) - - print_rank0(f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}") - dist.barrier(device_ids=[device.index]) - torch.cuda.synchronize(device) - finally: - if dist.is_initialized(): - dist.destroy_process_group() - - -if __name__ == "__main__": - main() diff --git a/examples/example_sanity_check_qwen3_moe.py b/examples/example_sanity_check_qwen3_moe.py deleted file mode 100644 index dd16afb7..00000000 --- a/examples/example_sanity_check_qwen3_moe.py +++ /dev/null @@ -1,466 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import logging -import os -import time - -import torch -import torch.distributed as dist -import torch.distributed.nn.functional as dist_nn_func -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Shard - -from autoparallel._testing.models.qwen3 import ( - Qwen3ModelArgs, - Transformer, - qwen3_235b_a22b_args, - qwen3_30b_a3b_args, - qwen3_moe_debug_args, -) -from autoparallel.api import AutoParallel -from autoparallel.compile import autoparallel_backend - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Run a real Qwen3 MoE AutoParallel training sanity check." - ) - parser.add_argument( - "--flavor", - choices=("moe-tiny", "moe-debug", "30b-a3b", "235b-a22b"), - default="30b-a3b", - help="Qwen3 MoE model size. Defaults to the real Qwen3-30B-A3B model.", - ) - parser.add_argument( - "--global-batch-size", - type=int, - default=4, - help="Global batch size across data-parallel ranks.", - ) - parser.add_argument( - "--microbatch-size", - type=int, - default=1, - help="Per-rank input microbatch size before EP all-gather inside the model.", - ) - parser.add_argument( - "--seq-len", - type=int, - default=8192, - help="Sequence length. Defaults to 8192 for the 4xH100 sanity run.", - ) - parser.add_argument( - "--dp-degree", - type=int, - default=2, - help="Data-parallel mesh degree.", - ) - parser.add_argument( - "--ep-degree", - type=int, - default=2, - help="Expert-parallel mesh degree.", - ) - parser.add_argument( - "--train-steps", - type=int, - default=30, - help="Number of optimizer steps.", - ) - parser.add_argument( - "--lr", - type=float, - default=3e-4, - help="Optimizer learning rate.", - ) - parser.add_argument( - "--optimizer", - choices=("adamw", "sgd", "none"), - default="adamw", - help="Optimizer to use after backward. 
Use sgd/none for large-model memory smoke runs.", - ) - parser.add_argument( - "--max-grad-norm", - type=float, - default=1.0, - help="Gradient clipping max norm.", - ) - parser.add_argument( - "--loss-chunk-size", - type=int, - default=512, - help=( - "Sequence chunk size for vocab-parallel cross entropy. " - "Keeps the 8192-token real-model run from materializing full-size " - "float logits and exp buffers at once." - ), - ) - parser.add_argument( - "--skip-loss-improvement-check", - action="store_true", - help="Only require finite forward/backward/optimizer steps.", - ) - parser.add_argument( - "--seed", - type=int, - default=0, - help="Seed for model initialization and synthetic data generation.", - ) - parser.add_argument( - "--compile", - action="store_true", - help="Compile the placed module with the AutoParallel backend before training.", - ) - parser.add_argument( - "--verbose", - action="store_true", - help="Print the full AutoParallel optimizer log.", - ) - return parser.parse_args() - - -def make_model_args(flavor: str, seq_len: int | None) -> Qwen3ModelArgs: - if flavor == "moe-tiny": - max_seq_len = 512 if seq_len is None else seq_len - return Qwen3ModelArgs( - dim=64, - n_layers=1, - n_heads=4, - n_kv_heads=2, - head_dim=16, - hidden_dim=128, - vocab_size=128, - max_seq_len=max_seq_len, - moe_enabled=True, - moe_hidden_dim=32, - num_experts=8, - top_k=2, - route_norm=True, - score_before_experts=False, - moe_axis_name="ep", - ) - overrides = {"moe_axis_name": "ep"} - if seq_len is not None: - overrides["max_seq_len"] = seq_len - if flavor == "moe-debug": - return qwen3_moe_debug_args(**overrides) - if flavor == "30b-a3b": - return qwen3_30b_a3b_args(**overrides) - if flavor == "235b-a22b": - return qwen3_235b_a22b_args(**overrides) - raise ValueError(f"Unknown Qwen3 MoE flavor: {flavor}") - - -def init_distributed(args): - if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ: - raise RuntimeError( - "Run this example with torchrun, 
e.g. " - "torchrun --standalone --nproc-per-node 4 " - "examples/example_sanity_check_qwen3_moe.py" - ) - - world_size = int(os.environ["WORLD_SIZE"]) - local_rank = int(os.environ["LOCAL_RANK"]) - expected_world_size = args.dp_degree * args.ep_degree - if world_size != expected_world_size: - raise ValueError( - f"WORLD_SIZE ({world_size}) must equal dp-degree * ep-degree " - f"({args.dp_degree} * {args.ep_degree} = {expected_world_size})." - ) - if args.global_batch_size % args.dp_degree != 0: - raise ValueError( - f"global-batch-size ({args.global_batch_size}) must be divisible by " - f"dp-degree ({args.dp_degree})." - ) - - local_dp_batch_size = args.global_batch_size // args.dp_degree - local_dp_microbatch = args.microbatch_size * args.ep_degree - if local_dp_batch_size % local_dp_microbatch != 0: - raise ValueError( - f"local DP batch size ({local_dp_batch_size}) must be divisible by " - f"microbatch-size * ep-degree " - f"({args.microbatch_size} * {args.ep_degree} = {local_dp_microbatch})." 
- ) - - device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) - dist.init_process_group("nccl", device_id=device) - mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", - (args.dp_degree, args.ep_degree), - mesh_dim_names=("dp", "ep"), - ) - return device, mesh - - -def make_local_tokens(args, mesh, device, vocab_size: int) -> torch.Tensor: - coordinate = mesh.get_coordinate() - if coordinate is None: - raise RuntimeError("DeviceMesh coordinate is unavailable on this rank.") - dp_rank, _ep_rank = coordinate - local_dp_batch_size = args.global_batch_size // args.dp_degree - - generator = torch.Generator(device="cpu") - generator.manual_seed(args.seed) - tokens = torch.randint( - 0, - vocab_size, - (args.global_batch_size, args.seq_len + 1), - generator=generator, - dtype=torch.long, - ) - - start = dp_rank * local_dp_batch_size - stop = start + local_dp_batch_size - return tokens[start:stop].to(device, non_blocking=True) - - -def vocab_parallel_cross_entropy( - logits: torch.Tensor, - labels: torch.Tensor, - *, - vocab_size: int, - vocab_group, - vocab_rank: int, - vocab_degree: int, - global_token_count: int, -) -> torch.Tensor: - if logits.shape[:2] != labels.shape: - raise ValueError( - f"logits shape {tuple(logits.shape)} is incompatible with " - f"labels shape {tuple(labels.shape)}." 
- ) - - local_vocab_size = logits.shape[-1] - vocab_start = vocab_rank * local_vocab_size - vocab_stop = vocab_start + local_vocab_size - if vocab_rank == vocab_degree - 1: - vocab_stop = vocab_size - - logits = logits.float() - local_max = logits.amax(dim=-1) - with torch.no_grad(): - global_max = local_max.detach().clone() - dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=vocab_group) - - shifted_logits = logits - global_max.unsqueeze(-1) - local_exp_sum = shifted_logits.exp().sum(dim=-1) - global_exp_sum = dist_nn_func.all_reduce( - local_exp_sum, - op=dist.ReduceOp.SUM, - group=vocab_group, - ) - - target_mask = (labels >= vocab_start) & (labels < vocab_stop) - local_target = torch.zeros_like(labels, dtype=torch.long) - local_target[target_mask] = labels[target_mask] - vocab_start - local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1) - local_target_logits = local_target_logits * target_mask.to(logits.dtype) - target_logits = dist_nn_func.all_reduce( - local_target_logits, - op=dist.ReduceOp.SUM, - group=vocab_group, - ) - - loss_sum = (global_exp_sum.log() + global_max - target_logits).sum() - return loss_sum / (global_token_count * vocab_degree) - - -def chunk_ranges(size: int, chunk_size: int): - if chunk_size <= 0: - yield 0, size - return - for start in range(0, size, chunk_size): - yield start, min(start + chunk_size, size) - - -def print_rank0(message: str) -> None: - if dist.get_rank() == 0: - print(message, flush=True) - - -def print_cuda_memory(stage: str, device: torch.device) -> None: - allocated = torch.cuda.memory_allocated(device) / 1024**3 - reserved = torch.cuda.memory_reserved(device) / 1024**3 - max_reserved = torch.cuda.max_memory_reserved(device) / 1024**3 - print_rank0( - f"{stage}: cuda allocated={allocated:.2f}GiB " - f"reserved={reserved:.2f}GiB max_reserved={max_reserved:.2f}GiB" - ) - - -def main(): - args = parse_args() - logging.basicConfig(level=logging.DEBUG) - - device, mesh = 
init_distributed(args) - ep_group = mesh.get_group("ep") - ep_rank = mesh.get_local_rank("ep") - local_dp_batch_size = args.global_batch_size // args.dp_degree - local_dp_microbatch = args.microbatch_size * args.ep_degree - gradient_accumulation_steps = local_dp_batch_size // local_dp_microbatch - - torch.manual_seed(args.seed) - model_args = make_model_args(args.flavor, args.seq_len) - if args.seq_len is None: - args.seq_len = model_args.max_seq_len - if model_args.num_experts % args.ep_degree != 0: - raise ValueError( - f"num_experts ({model_args.num_experts}) must be divisible by " - f"ep-degree ({args.ep_degree})." - ) - trace_global_batch_size = args.microbatch_size * args.dp_degree * args.ep_degree - - with torch.device("meta"): - model = Transformer(model_args, mesh=mesh, moe_axis_name="ep") - - def input_fn(): - return torch.randint( - 0, - model_args.vocab_size, - (trace_global_batch_size, args.seq_len), - device=device, - ) - - mp_policy = MixedPrecisionPolicy( - param_dtype=torch.bfloat16, - reduce_dtype=torch.float32, - ) - - print_rank0( - f"Qwen3 {args.flavor} sanity check: " - f"mesh=(dp={args.dp_degree}, ep={args.ep_degree}), " - f"global_batch={args.global_batch_size}, " - f"local_dp_batch={local_dp_batch_size}, " - f"per_rank_microbatch={args.microbatch_size}, " - f"local_dp_microbatch={local_dp_microbatch}, " - f"grad_accum={gradient_accumulation_steps}, " - f"trace_global_batch={trace_global_batch_size}, " - f"seq_len={args.seq_len}, " - f"loss_chunk_size={args.loss_chunk_size}, " - f"optimizer={args.optimizer}" - ) - - t0 = time.time() - with AutoParallel( - model, - input_fn, - mesh, - mp_policy, - dynamic=True, - repeated_subgraphs=True, - ) as autop: - autop.add_parameter_memory_constraint(low=None, high=None) - autop.add_input_constraints([(Shard(0), Shard(0))]) - autop.add_output_constraints([(Shard(0), Shard(2))]) - sharding_placement = autop.optimize_placement(verbose=args.verbose) - parallel_mod = 
autop.apply_placement(sharding_placement) - - print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s") - print_cuda_memory("after AutoParallel", device) - - parallel_mod.to_empty(device=device) - print_cuda_memory("after to_empty", device) - parallel_mod.init_weights(buffer_device=device, seed=args.seed) # type: ignore[operator] - print_cuda_memory("after init_weights", device) - - if args.compile: - parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend()) - - batch = make_local_tokens(args, mesh, device, model_args.vocab_size) - inputs = batch[:, :-1].contiguous() - labels = batch[:, 1:].contiguous() - - ep_coordinate = mesh.get_coordinate()[1] - input_microbatches = [] - label_microbatches = [] - for start in range(0, local_dp_batch_size, local_dp_microbatch): - stop = start + local_dp_microbatch - input_block = inputs[start:stop] - input_start = ep_coordinate * args.microbatch_size - input_stop = input_start + args.microbatch_size - input_microbatches.append(input_block[input_start:input_stop].contiguous()) - label_microbatches.append(labels[start:stop].contiguous()) - - global_token_count = args.global_batch_size * args.seq_len - if args.optimizer == "adamw": - optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr) - elif args.optimizer == "sgd": - optimizer = torch.optim.SGD(parallel_mod.parameters(), lr=args.lr) - else: - optimizer = None - - try: - losses: list[float] = [] - for step in range(args.train_steps): - if optimizer is not None: - optimizer.zero_grad(set_to_none=True) - else: - parallel_mod.zero_grad(set_to_none=True) - step_loss = torch.zeros((), device=device) - for micro_inputs, micro_labels in zip( - input_microbatches, label_microbatches - ): - logits = parallel_mod(micro_inputs) - - seq_ranges = list(chunk_ranges(logits.shape[1], args.loss_chunk_size)) - for chunk_idx, (seq_start, seq_stop) in enumerate(seq_ranges): - logits_chunk = logits[:, seq_start:seq_stop] - labels_chunk = micro_labels[:, 
seq_start:seq_stop] - loss = vocab_parallel_cross_entropy( - logits_chunk, - labels_chunk, - vocab_size=model_args.vocab_size, - vocab_group=ep_group, - vocab_rank=ep_rank, - vocab_degree=args.ep_degree, - global_token_count=global_token_count, - ) - if torch.any(torch.isnan(loss)): - raise RuntimeError("Found NaNs in Qwen3 MoE training loss.") - - retain_graph = chunk_idx != len(seq_ranges) - 1 - loss.backward(retain_graph=retain_graph) - step_loss = step_loss + loss.detach() - - torch.nn.utils.clip_grad_norm_( - parallel_mod.parameters(), args.max_grad_norm - ) - if optimizer is not None: - optimizer.step() - - with torch.no_grad(): - logged_loss = step_loss.clone() - dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM) - loss_value = float(logged_loss.item()) - losses.append(loss_value) - print_rank0(f"step={step:03d} loss={loss_value:.6f}") - print_cuda_memory(f"after step {step:03d}", device) - - if ( - not args.skip_loss_improvement_check - and len(losses) > 1 - and losses[-1] >= losses[0] - ): - raise RuntimeError( - f"Qwen3 MoE training loss did not improve: " - f"initial={losses[0]:.6f}, final={losses[-1]:.6f}" - ) - - if len(losses) > 1: - print_rank0( - f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}" - ) - dist.barrier(device_ids=[device.index]) - torch.cuda.synchronize(device) - finally: - if dist.is_initialized(): - dist.destroy_process_group() - - -if __name__ == "__main__": - main() diff --git a/examples/example_torchtitan_qwen3_dense.py b/examples/example_torchtitan_qwen3_dense.py deleted file mode 100644 index a4685d1b..00000000 --- a/examples/example_torchtitan_qwen3_dense.py +++ /dev/null @@ -1,370 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import dataclasses -import logging -import os -import sys -import time -from pathlib import Path - -import torch -import torch.distributed as dist -import torch.distributed.nn.functional as dist_nn_func -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor.placement_types import Replicate, Shard - -from autoparallel.api import AutoParallel -from autoparallel.compile import autoparallel_backend - - -def _add_sibling_torchtitan_to_path() -> None: - repo_root = Path(__file__).resolve().parents[1] - torchtitan_root = repo_root.parent / "torchtitan" - if torchtitan_root.exists(): - sys.path.insert(0, str(torchtitan_root)) - - -_add_sibling_torchtitan_to_path() - -from torchtitan.models.qwen3 import Qwen3Model, qwen3_configs # noqa: E402 - - -def parse_args(): - parser = argparse.ArgumentParser( - description=( - "Run torchtitan's dense Qwen3 model through AutoParallel's " - "searched placement on real GPUs." - ) - ) - parser.add_argument( - "--flavor", - choices=("debugmodel", "debugmodel_fused_qkv", "0.6B", "1.7B", "4B", "8B"), - default="8B", - help="Dense torchtitan Qwen3 flavor.", - ) - parser.add_argument( - "--global-batch-size", - type=int, - default=4, - help="Global batch size across data-parallel ranks.", - ) - parser.add_argument( - "--microbatch-size", - type=int, - default=1, - help="Per-DP-rank microbatch size for gradient accumulation.", - ) - parser.add_argument( - "--seq-len", - type=int, - default=2048, - help="Sequence length for the real sanity run.", - ) - parser.add_argument( - "--dp-degree", - type=int, - default=2, - help="Data-parallel mesh degree.", - ) - parser.add_argument( - "--tp-degree", - type=int, - default=2, - help="Tensor-parallel mesh degree.", - ) - parser.add_argument( - "--train-steps", - type=int, - default=2, - help="Number of optimizer steps.", - ) - parser.add_argument( - "--lr", - type=float, - default=3e-4, - help="AdamW learning rate.", - ) - parser.add_argument( - 
"--max-grad-norm", - type=float, - default=1.0, - help="Gradient clipping max norm.", - ) - parser.add_argument( - "--seed", - type=int, - default=0, - help="Seed for model initialization and synthetic data generation.", - ) - parser.add_argument( - "--compile", - action="store_true", - help="Compile the placed module with the AutoParallel backend before training.", - ) - parser.add_argument( - "--verbose", - action="store_true", - help="Print the full AutoParallel optimizer log.", - ) - return parser.parse_args() - - -def make_model_config(flavor: str, seq_len: int) -> Qwen3Model.Config: - config = qwen3_configs[flavor](attn_backend="sdpa") - config.rope = dataclasses.replace(config.rope, max_seq_len=seq_len) - return config - - -def init_distributed(args): - if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ: - raise RuntimeError( - "Run this example with torchrun, e.g. " - "torchrun --standalone --nproc-per-node 4 " - "examples/example_torchtitan_qwen3_dense.py" - ) - - world_size = int(os.environ["WORLD_SIZE"]) - local_rank = int(os.environ["LOCAL_RANK"]) - expected_world_size = args.dp_degree * args.tp_degree - if world_size != expected_world_size: - raise ValueError( - f"WORLD_SIZE ({world_size}) must equal dp-degree * tp-degree " - f"({args.dp_degree} * {args.tp_degree} = {expected_world_size})." - ) - if args.global_batch_size % args.dp_degree != 0: - raise ValueError( - f"global-batch-size ({args.global_batch_size}) must be divisible by " - f"dp-degree ({args.dp_degree})." - ) - local_batch_size = args.global_batch_size // args.dp_degree - if local_batch_size % args.microbatch_size != 0: - raise ValueError( - f"local batch size ({local_batch_size}) must be divisible by " - f"microbatch-size ({args.microbatch_size})." 
- ) - - device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) - dist.init_process_group("nccl", device_id=device) - mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", - (args.dp_degree, args.tp_degree), - mesh_dim_names=("dp", "tp"), - ) - return device, mesh - - -def make_local_tokens(args, mesh, device, vocab_size: int) -> torch.Tensor: - coordinate = mesh.get_coordinate() - if coordinate is None: - raise RuntimeError("DeviceMesh coordinate is unavailable on this rank.") - dp_rank, _tp_rank = coordinate - local_batch_size = args.global_batch_size // args.dp_degree - - generator = torch.Generator(device="cpu") - generator.manual_seed(args.seed) - tokens = torch.randint( - 0, - vocab_size, - (args.global_batch_size, args.seq_len + 1), - generator=generator, - dtype=torch.long, - ) - - start = dp_rank * local_batch_size - stop = start + local_batch_size - return tokens[start:stop].to(device, non_blocking=True) - - -def vocab_parallel_cross_entropy( - logits: torch.Tensor, - labels: torch.Tensor, - *, - vocab_size: int, - tp_group, - tp_rank: int, - tp_degree: int, - global_token_count: int, -) -> torch.Tensor: - if logits.shape[:2] != labels.shape: - raise ValueError( - f"logits shape {tuple(logits.shape)} is incompatible with " - f"labels shape {tuple(labels.shape)}." 
- ) - - local_vocab_size = logits.shape[-1] - vocab_start = tp_rank * local_vocab_size - vocab_stop = vocab_start + local_vocab_size - if tp_rank == tp_degree - 1: - vocab_stop = vocab_size - - logits = logits.float() - local_max = logits.amax(dim=-1) - with torch.no_grad(): - global_max = local_max.detach().clone() - dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=tp_group) - - shifted_logits = logits - global_max.unsqueeze(-1) - local_exp_sum = shifted_logits.exp().sum(dim=-1) - global_exp_sum = dist_nn_func.all_reduce( - local_exp_sum, - op=dist.ReduceOp.SUM, - group=tp_group, - ) - - target_mask = (labels >= vocab_start) & (labels < vocab_stop) - local_target = torch.zeros_like(labels, dtype=torch.long) - local_target[target_mask] = labels[target_mask] - vocab_start - local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1) - local_target_logits = local_target_logits * target_mask.to(logits.dtype) - target_logits = dist_nn_func.all_reduce( - local_target_logits, - op=dist.ReduceOp.SUM, - group=tp_group, - ) - - loss_sum = (global_exp_sum.log() + global_max - target_logits).sum() - return loss_sum / (global_token_count * tp_degree) - - -def print_rank0(message: str) -> None: - if dist.get_rank() == 0: - print(message, flush=True) - - -def main(): - args = parse_args() - logging.basicConfig(level=logging.DEBUG) - - device, mesh = init_distributed(args) - tp_group = mesh.get_group("tp") - tp_rank = mesh.get_local_rank("tp") - local_batch_size = args.global_batch_size // args.dp_degree - gradient_accumulation_steps = local_batch_size // args.microbatch_size - - torch.manual_seed(args.seed) - model_config = make_model_config(args.flavor, args.seq_len) - vocab_size = model_config.vocab_size - - with torch.device("meta"): - model = model_config.build() - - def input_fn(): - return torch.randint( - 0, - vocab_size, - (args.global_batch_size, args.seq_len), - device=device, - ) - - mp_policy = MixedPrecisionPolicy( - 
param_dtype=torch.bfloat16, - reduce_dtype=torch.float32, - ) - - print_rank0( - f"torchtitan Qwen3 {args.flavor} via AutoParallel: " - f"mesh=(dp={args.dp_degree}, tp={args.tp_degree}), " - f"global_batch={args.global_batch_size}, " - f"local_batch={local_batch_size}, " - f"microbatch={args.microbatch_size}, " - f"grad_accum={gradient_accumulation_steps}, " - f"seq_len={args.seq_len}" - ) - - t0 = time.time() - with AutoParallel( - model, - input_fn, - mesh, - mp_policy, - repeated_subgraphs=True, - ) as autop: - autop.add_parameter_memory_constraint(low=None, high=None) - autop.add_input_constraints([(Shard(0), Replicate())]) - autop.add_output_constraints([(Shard(0), Shard(2))]) - sharding_placement = autop.optimize_placement(verbose=args.verbose) - parallel_mod = autop.apply_placement(sharding_placement) - - print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s") - - parallel_mod.to_empty(device=device) - torch.manual_seed(args.seed) - parallel_mod.init_weights(buffer_device=device) # type: ignore[operator] - - if args.compile: - parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend()) - - batch = make_local_tokens(args, mesh, device, vocab_size) - inputs = batch[:, :-1].contiguous() - labels = batch[:, 1:].contiguous() - input_microbatches = torch.split(inputs, args.microbatch_size, dim=0) - label_microbatches = torch.split(labels, args.microbatch_size, dim=0) - - global_token_count = args.global_batch_size * args.seq_len - optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr) - - try: - losses: list[float] = [] - for step in range(args.train_steps): - optimizer.zero_grad(set_to_none=True) - step_loss = torch.zeros((), device=device) - for micro_inputs, micro_labels in zip( - input_microbatches, label_microbatches - ): - logits = parallel_mod(micro_inputs) - if torch.any(torch.isnan(logits)): - raise RuntimeError("Found NaNs in forward output.") - - loss = vocab_parallel_cross_entropy( - logits, - micro_labels, - 
vocab_size=vocab_size, - tp_group=tp_group, - tp_rank=tp_rank, - tp_degree=args.tp_degree, - global_token_count=global_token_count, - ) - if torch.any(torch.isnan(loss)): - raise RuntimeError("Found NaNs in training loss.") - - loss.backward() - step_loss = step_loss + loss.detach() - - torch.nn.utils.clip_grad_norm_( - parallel_mod.parameters(), args.max_grad_norm - ) - optimizer.step() - - with torch.no_grad(): - logged_loss = step_loss.clone() - dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM) - loss_value = float(logged_loss.item()) - losses.append(loss_value) - print_rank0(f"step={step:03d} loss={loss_value:.6f}") - - if len(losses) > 1 and losses[-1] >= losses[0]: - raise RuntimeError( - f"Training loss did not improve: " - f"initial={losses[0]:.6f}, final={losses[-1]:.6f}" - ) - - if len(losses) > 1: - print_rank0( - f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}" - ) - else: - print_rank0(f"Completed one step: loss={losses[0]:.6f}") - dist.barrier(device_ids=[device.index]) - torch.cuda.synchronize(device) - finally: - if dist.is_initialized(): - dist.destroy_process_group() - - -if __name__ == "__main__": - main() diff --git a/tests/conftest.py b/tests/conftest.py index d5d23ea1..22af2357 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -40,6 +40,16 @@ def apply_cuda_patches(func): return func +@pytest.fixture(autouse=True) +def _reset_placement_options_cache(): + """The placement-options cache is a process-global; clear it before each test + so optimizer builds never reuse stale strategies from a prior test's model.""" + from autoparallel.shardings.placement_options import reset_placement_options_cache + + reset_placement_options_cache() + yield + + @pytest.fixture(scope="module", autouse=True) def init_pg(): world_size = 256 diff --git a/tests/test_dsv3_torchtitan_config.py b/tests/test_dsv3_torchtitan_config.py deleted file mode 100644 index e009206b..00000000 --- a/tests/test_dsv3_torchtitan_config.py +++ /dev/null @@ -1,35 
+0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. - -import sys -from pathlib import Path - -import pytest -import torch - -from autoparallel._testing.models.dsv3 import DeepSeekV3Model - - -def test_dsv3_accepts_torchtitan_grouped_experts_config(): - torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan" - if not torchtitan_root.exists(): - pytest.skip("torchtitan sibling checkout not found") - sys.path.insert(0, str(torchtitan_root)) - - try: - from torchtitan.models.deepseek_v3 import deepseekv3_configs # type: ignore[import-not-found] - except Exception as exc: - pytest.skip(f"torchtitan DeepSeek-V3 config unavailable: {exc}") - - with torch.device("meta"): - model = DeepSeekV3Model( - deepseekv3_configs["debugmodel"]( - attn_backend="sdpa", - moe_comm_backend="standard", - ) - ) - - moe_layer = next(layer for layer in model.layers.values() if layer.moe_enabled) - assert moe_layer.moe.experts.use_grouped_mm diff --git a/tests/test_optimize_placement.py b/tests/test_optimize_placement.py index 9325f1f5..20ac9d95 100644 --- a/tests/test_optimize_placement.py +++ b/tests/test_optimize_placement.py @@ -867,9 +867,7 @@ def test_invalid_strategies_are_pruned(device_mesh_2d): assert all(k in opt._valid_keys for k in opt.decision_vars) # No inf-cost (== 0) constraints should be emitted any more. - assert not any( - name.startswith("inf_cases") for name in opt.prob.constraints - ) + assert not any(name.startswith("inf_cases") for name in opt.prob.constraints) # The pruned problem must still solve to a valid solution. solution = autop.optimize_placement() diff --git a/tests/test_qwen3.py b/tests/test_qwen3.py deleted file mode 100644 index 5b32bc5b..00000000 --- a/tests/test_qwen3.py +++ /dev/null @@ -1,323 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 
-# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. - -import sys -from pathlib import Path - -import pytest -import torch -from torch.distributed.fsdp import MixedPrecisionPolicy -from torch.distributed.tensor import DTensor -from torch.distributed.tensor.placement_types import Replicate, Shard - -from autoparallel._testing.models.qwen3 import ( - Qwen3ModelArgs, - Transformer, - apply_rotary_emb_cos_sin, - qwen3_debug_args, - qwen3_args_from_torchtitan_config, - qwen3_moe_debug_args, -) -from autoparallel.api import AutoParallel, auto_parallel - - -def _tiny_args(**overrides) -> Qwen3ModelArgs: - args = Qwen3ModelArgs( - dim=64, - n_layers=2, - n_heads=4, - n_kv_heads=2, - head_dim=16, - hidden_dim=128, - vocab_size=128, - max_seq_len=16, - ) - for key, value in overrides.items(): - setattr(args, key, value) - args.__post_init__() - return args - - -def _tiny_moe_args(**overrides) -> Qwen3ModelArgs: - args = Qwen3ModelArgs( - dim=32, - n_layers=1, - n_heads=4, - n_kv_heads=2, - head_dim=8, - hidden_dim=64, - vocab_size=64, - max_seq_len=4, - moe_enabled=True, - moe_hidden_dim=16, - num_experts=64, - top_k=8, - route_norm=True, - score_before_experts=False, - moe_axis_name="tp", - ) - for key, value in overrides.items(): - setattr(args, key, value) - args.__post_init__() - return args - - -def test_qwen3_forward_shape(): - args = _tiny_args() - model = Transformer(args) - model.init_weights(seed=0) - - tokens = torch.randint(0, args.vocab_size, (2, args.max_seq_len)) - logits = model(tokens) - - assert logits.shape == (2, args.max_seq_len, args.vocab_size) - - -def test_qwen3_qk_norm_changes_logits(): - args = _tiny_args(n_layers=1) - model = Transformer(args) - model.init_weights(seed=0) - - tokens = torch.randint(0, args.vocab_size, (2, args.max_seq_len)) - logits = model(tokens) - - with torch.no_grad(): - model.layers["0"].attention.q_norm.weight.zero_() - logits_without_q = 
model(tokens) - - assert not torch.allclose(logits, logits_without_q) - - -def test_qwen3_weight_tying_survives_init_weights(): - args = _tiny_args(enable_weight_tying=True) - model = Transformer(args) - - assert model.tok_embeddings.weight is model.lm_head.weight - model.init_weights(seed=0) - assert model.tok_embeddings.weight is model.lm_head.weight - - -def test_qwen3_debug_args_matches_torchtitan_dense_shape(): - args = qwen3_debug_args(max_seq_len=32) - - assert args.dim == 256 - assert args.n_layers == 8 - assert args.n_heads == 16 - assert args.n_kv_heads == 8 - assert args.head_dim == 128 - assert args.hidden_dim == 3072 - assert args.vocab_size == 2048 - assert args.rope_theta == 1000000.0 - assert args.enable_weight_tying - - -def test_qwen3_moe_debug_args_matches_torchtitan_shape(): - args = qwen3_moe_debug_args(max_seq_len=32) - - assert args.dim == 256 - assert args.n_layers == 8 - assert args.n_heads == 16 - assert args.n_kv_heads == 8 - assert args.head_dim == 128 - assert args.moe_enabled - assert args.moe_hidden_dim == 768 - assert args.num_experts == 64 - assert args.top_k == 8 - assert args.route_norm - assert not args.score_before_experts - - -@pytest.mark.parametrize( - ("flavor", "expected"), - [ - ( - "8B", - { - "dim": 4096, - "n_layers": 36, - "n_heads": 32, - "n_kv_heads": 8, - "head_dim": 128, - "hidden_dim": 12288, - "vocab_size": 151936, - "moe_enabled": False, - "num_experts": 0, - "top_k": 1, - "max_seq_len": 4096, - }, - ), - ( - "30B-A3B", - { - "dim": 2048, - "n_layers": 48, - "n_heads": 32, - "n_kv_heads": 4, - "head_dim": 128, - "hidden_dim": 0, - "vocab_size": 151936, - "moe_enabled": True, - "moe_hidden_dim": 768, - "num_experts": 128, - "top_k": 8, - "route_norm": True, - "score_before_experts": False, - "max_seq_len": 262144, - }, - ), - ], -) -def test_qwen3_args_from_torchtitan_config(flavor, expected): - torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan" - if not torchtitan_root.exists(): - 
pytest.skip("torchtitan sibling checkout not found") - sys.path.insert(0, str(torchtitan_root)) - - try: - from torchtitan.models.qwen3 import qwen3_configs # type: ignore[import-not-found] - except Exception as exc: - pytest.skip(f"torchtitan Qwen3 config unavailable: {exc}") - - args = qwen3_args_from_torchtitan_config( - qwen3_configs[flavor](attn_backend="sdpa") - ) - - for attr, value in expected.items(): - assert getattr(args, attr) == value - assert args.rope_theta == 1000000.0 - assert args.norm_eps == 1e-6 - - -def test_qwen3_cos_sin_rope_matches_torchtitan_helper(): - torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan" - if not torchtitan_root.exists(): - pytest.skip("torchtitan sibling checkout not found") - sys.path.insert(0, str(torchtitan_root)) - - try: - from torchtitan.models.common.rope import ( # type: ignore[import-not-found] - RoPE, - apply_rotary_emb_cos_sin as tt_apply_rotary_emb_cos_sin, - ) - except Exception as exc: - pytest.skip(f"torchtitan Qwen3 RoPE helper unavailable: {exc}") - - args = _tiny_args() - rope = RoPE( - RoPE.Config( - dim=args.head_dim, - max_seq_len=args.max_seq_len, - theta=args.rope_theta, - backend="cos_sin", - ) - ) - xq = torch.randn(2, args.max_seq_len, args.n_heads, args.head_dim) - xk = torch.randn(2, args.max_seq_len, args.n_kv_heads, args.head_dim) - - actual = apply_rotary_emb_cos_sin(xq, xk, rope.cache) - expected = tt_apply_rotary_emb_cos_sin(xq, xk, rope.cache) - - torch.testing.assert_close(actual[0], expected[0]) - torch.testing.assert_close(actual[1], expected[1]) - - -def test_qwen3_autoparallel_pipeline_smoke(device_mesh_2d): - args = _tiny_args(n_layers=2, max_seq_len=8) - batch_size = 2 * device_mesh_2d.shape[0] - - with torch.device("meta"): - model = Transformer(args) - - def input_fn(): - return torch.randint( - 0, - args.vocab_size, - (batch_size, args.max_seq_len), - device="cuda", - ) - - mp_policy = MixedPrecisionPolicy( - param_dtype=torch.bfloat16, - 
reduce_dtype=torch.float32, - ) - - with AutoParallel( - model, - input_fn, - device_mesh_2d, - mp_policy, - repeated_subgraphs=True, - ) as autop: - autop.add_input_constraints([(Shard(0), Replicate())]) - autop.add_output_constraints([(Shard(0), Shard(2))]) - sharding_placement = autop.optimize_placement(verbose=False) - parallel_mod = autop.apply_placement(sharding_placement) - - assert isinstance(parallel_mod, Transformer) - - -def test_qwen3_moe_auto_parallel_smoke(device_mesh_2d): - args = _tiny_moe_args() - local_batch_size = 1 - - with torch.device("meta"): - model = Transformer(args, mesh=device_mesh_2d, moe_axis_name="tp") - - expected_param_shapes = { - name: tuple(param.shape) for name, param in model.named_parameters() - } - expected_nparams = sum(param.numel() for param in model.parameters()) - - tokens = DTensor.from_local( - torch.randint( - 0, - args.vocab_size, - (local_batch_size, args.max_seq_len), - device="cuda", - ), - device_mesh_2d, - [Shard(0), Shard(0)], - ) - - mp_policy = MixedPrecisionPolicy( - param_dtype=torch.bfloat16, - reduce_dtype=torch.float32, - ) - parallel_mod = auto_parallel( - model, - device_mesh_2d, - sample_inputs=(tokens,), - out_shardings=(Shard(0), Shard(2)), - mp_policy=mp_policy, - dynamic=True, - ) - - assert isinstance(parallel_mod, Transformer) - assert sum(param.numel() for param in parallel_mod.parameters()) == expected_nparams - assert { - name: tuple(param.shape) for name, param in parallel_mod.named_parameters() - } == expected_param_shapes - assert parallel_mod.layers["0"].moe.experts.w1.shape == ( - args.num_experts, - args.moe_hidden_dim, - args.dim, - ) - - parallel_mod.to_empty(device="cuda") - parallel_mod.init_weights(buffer_device=torch.device("cuda"), seed=0) - - local_tokens = torch.randint( - 0, - args.vocab_size, - (local_batch_size, args.max_seq_len), - device="cuda", - ) - out = parallel_mod(local_tokens) - assert out.shape == ( - local_batch_size * device_mesh_2d.shape[1], - args.max_seq_len, 
- args.vocab_size // device_mesh_2d.shape[1], - ) - out.backward(torch.randn_like(out)) From 8eda68d266fa7667d0fe432774ac7c78324a6d6b Mon Sep 17 00:00:00 2001 From: Kaijian Wang Date: Mon, 8 Jun 2026 12:37:07 -0700 Subject: [PATCH 27/27] Fall back to serial build when CUDA is initialized (fix fork crash) The parallel decision-var cost build forks workers, but forking a process that has already initialized CUDA crashes them with "Cannot re-initialize CUDA in forked subprocess" once they touch the NCCL cost model. Real-GPU runs (example scripts, torchrun) and CUDA-touching tests hit this. Skip the fork and use the byte-identical serial path whenever torch.cuda.is_initialized(). Authored with Claude. --- autoparallel/optimize_sharding.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py index b14cb142..c3832c87 100644 --- a/autoparallel/optimize_sharding.py +++ b/autoparallel/optimize_sharding.py @@ -1127,7 +1127,16 @@ def _compute_node_edge_costs(self, root_idxs): global _FORK_OPT _FORK_OPT = self try: - if _PARALLEL_BUILD_WORKERS <= 1 or len(root_idxs) < 64: + # Forking a process that has already initialized CUDA crashes the + # workers ("Cannot re-initialize CUDA in forked subprocess") once they + # touch the NCCL cost model. Real-GPU runs (examples, torchrun) and + # any test that has touched CUDA hit this, so fall back to the + # (byte-identical) serial path whenever CUDA is live. + if ( + _PARALLEL_BUILD_WORKERS <= 1 + or len(root_idxs) < 64 + or torch.cuda.is_initialized() + ): return [_par_node_edge_costs(ni) for ni in root_idxs] import multiprocessing as mp